def run(method="train", save_path=None, infer_texts=[]):
    shuffle_slicer = ShuffleSlicer()
    # start_time = time.time()

    raw_data_path = "/home/wujinjie/kesci_question_multilabel_classification/data/raw_data/baidu/nlp_db.baidu_text.csv"
    texts = pd.read_csv(raw_data_path)
    train_df, dev_df, test_df = shuffle_slicer.split(texts, dev=True)

    # Test_data = {'label': [0] * len(texts), 'text': test_texts}

    clip = 5.0
    epochs = 100
    # log_interval = 50
    test_batch_size = 128
    train_batch_size = 128
    train_texts, train_labels = process_corpus_dl(train_df)
    Train_data = {'label': train_labels, 'text': train_texts}

    dev_texts, dev_labels = process_corpus_dl(dev_df)
    Dev_data = {'label': dev_labels, 'text': dev_texts}
    vocab = Vocab(Train_data)
    step = 0

    def _eval(data):
        model.eval()  # eval mode: disable Dropout and BatchNorm updates
        # data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, dev_f1 = get_score(y_true, y_pred)
        return score, dev_f1

    def _infer(data):
        model.eval()  # eval mode for inference
        y_pred = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, _ = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
        # decode the accumulated label ids back to names once, after all batches
        print(label_encoder.label2name(y_pred))
        return y_pred

    if method == "train":
        model = Model(vocab, label_encoder)
        # cross-entropy loss over the label classes
        criterion = nn.CrossEntropyLoss()

        # convert the raw text/label dicts into examples the model can consume
        train_data = get_examples_bert(Train_data, model.word_encoder, vocab, label_encoder)
        dev_data = get_examples_bert(Dev_data, model.word_encoder, vocab, label_encoder)
        # number of batches per epoch
        batch_num = int(np.ceil(len(train_data) / float(train_batch_size)))
        optimizer = Optimizer(model.all_parameters, steps=batch_num * epochs)  # optimizer (scheduler steps cover the full run)
        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 3  # stop early if dev F1 does not improve for this many consecutive epochs
        # train
        print("start train")
        for epoch in range(1, epochs + 1):
            optimizer.zero_grad()
            model.train()  # training mode: enable Dropout and BatchNorm updates
            overall_losses = 0
            losses = 0
            # batch_idx = 1
            y_pred = []
            y_true = []
            for batch_data in data_iter(train_data, train_batch_size, shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=clip)  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1
                # print(step)
            overall_losses /= batch_num
            overall_losses = reformat(overall_losses, 4)
            score, train_f1 = get_score(y_true, y_pred)
            print("epoch:{}, loss:{}, score:{}, train_f1:{}".format(
                epoch, overall_losses, score, train_f1))
            # if set(y_true) == set(y_pred):
            #     print("report")
            #     report = classification_report(y_true, y_pred, digits=4, target_names=label_encoder.target_names)
            #     # logging.info('\n' + report)
            #     print(report)

            # evaluate on the dev set
            dev_score, dev_f1 = _eval(data=dev_data)

            if best_dev_f1 <= dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model, epoch, save_folder="/home/wujinjie/kesci_question_multilabel_classification/data/textbert")
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # patience exhausted, stop training
                    break

            print("score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format(
                dev_f1, score, best_train_f1, best_dev_f1))
    else:
        model = model_utils.load_checkpoint(save_path)
        if method == "test":
            test_texts, test_labels = process_corpus_dl(test_df)  # evaluate on the held-out test split
            Test_data = {'label': test_labels, 'text': test_texts}
            test_data = get_examples_bert(Test_data, model.word_encoder, vocab, label_encoder)
            # model.load_state_dict(torch.load(save_model))
            _, test_f1 = _eval(data=test_data)
            print(test_f1)

        elif method == "infer":
            infer_texts = list(map(segment, infer_texts))
            # print(infer_texts)
            Infer_data = {'label': [0] * len(infer_texts), 'text': infer_texts}
            infer_data = get_examples_bert(Infer_data, model.word_encoder, vocab, label_encoder)
            _infer(data=infer_data)
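
# A minimal usage sketch for the run() entry point defined above. This block is
# an illustrative assumption, not part of the original example: the checkpoint
# path and the sample text below are hypothetical placeholders.
if __name__ == "__main__":
    # Train from scratch; the best checkpoint path is printed during training.
    run(method="train")
    # Evaluate a saved checkpoint on the test split:
    # run(method="test", save_path="/path/to/saved/checkpoint")
    # Run inference on raw texts (they are segmented inside run()):
    # run(method="infer", save_path="/path/to/saved/checkpoint",
    #     infer_texts=["example question text"])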
Example #4
batch_num = int(np.ceil(len(train_data) / float(train_batch_size)))  # number of batches per epoch
if __name__ == "__main__":
    best_train_f1, best_dev_f1 = 0, 0
    early_stop = -1
    EarlyStopEpochs = 3  # stop early if dev F1 does not improve for this many consecutive epochs
    # train
    print("start train")
    for epoch in range(1, epochs + 1):
        optimizer.zero_grad()
    model.train()  # training mode: enable Dropout and BatchNorm updates
        overall_losses = 0
        losses = 0
        batch_idx = 1
        y_pred = []
        y_true = []
        for batch_data in data_iter(train_data, train_batch_size, shuffle=True):
            torch.cuda.empty_cache()
            batch_inputs, batch_labels = batch2tensor(batch_data)
            batch_outputs = model(batch_inputs)
            loss = criterion(batch_outputs, batch_labels)
            loss.backward()

            loss_value = loss.detach().cpu().item()
            losses += loss_value
            overall_losses += loss_value

            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())

            nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=clip)  # gradient clipping
            for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):