Example #1
def start():
    # Prefer cached data when available
    if not os.path.exists(args.TRAIN) or not os.path.exists(args.VALID):
        produce_data(user_define=USER_DEFINE)

    if os.path.exists(args.TRAIN_CACHE):
        train_iter, num_train_steps = torch.load(args.TRAIN_CACHE)
    else:
        train_iter, num_train_steps = create_batch_iter("train")

    if os.path.exists(args.VALID_CACHE):
        eval_iter = torch.load(args.VALID_CACHE)
    else:
        eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
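    # Assuming num_train_steps counts optimizer steps over all epochs, this
    # roughly recovers the number of training examples per epoch.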

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #2
def predict_k_fold():
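    # Load a trained checkpoint from output_dir and write test-set predictions
    # to <output_dir>/test.csv.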
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    logger.info(f"Training arguments: {args}")

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)

    if "_" not in args.output_dir:
        args.output_dir = args.output_dir + sorted(os.listdir(args.output_dir))[-1]  # latest training run
        print(f"model {args.output_dir} used for prediction")

    test_dataloader, examples = create_batch_iter(args, "test")

    # tokenizer = AutoTokenizer.from_pretrained(args.model)

    # bert_config = AutoConfig.from_pretrained(args.model, return_dict=True)

    model = load_model(args)
    model.to(device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    with torch.no_grad():
        test_logits = []
        for step, batch in enumerate(tqdm(test_dataloader, desc="test", ascii=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids = batch

            outputs = model(input_ids, input_mask, segment_ids)
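            # arg-max over the class dimension gives the predicted label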
            logits = torch.max(outputs.logits, dim=1)[1]
            if device.type == "cuda":
                logits = logits.cpu().numpy().astype(int)
            else:
                logits = logits.numpy()
            test_logits.extend(logits.tolist())

        pred_dt = {}
        output_path = args.output_dir + "/test.csv"
        with open(output_path, "w", encoding="utf-8") as fw:
            i = 1
            for exp, label in zip(examples, test_logits):
                print(f"write line:{i}")
                i += 1
                exp: InputExample = exp
                _id = exp.label
                question = exp.text_a
                context = exp.text_b
                pred_dt[_id] = label
                fw.write(",".join([_id, str(label)]) + "\n")
        logger.info(f"output path: {output_path}")
Example #3
def start():

    train_iter, num_train_steps = create_batch_iter("train", args.TRAIN_PATH)
    eval_iter = create_batch_iter("dev", args.VALID_PATH)

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    print(f'epoch_size = {epoch_size}')
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #4
def start():
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = QaExtract.from_pretrained(args.bert_model)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #5
def start():
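    # Expose --do_not_train_ernie / --do_CRF as CLI flags and copy them onto
    # the global config before training.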
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--do_not_train_ernie",
        default=False,
        action='store_true',
    )
    parser.add_argument(
        "--do_CRF",
        default=False,
        action='store_true',
    )
    arg = parser.parse_args()
    args.do_not_train_ernie = arg.do_not_train_ernie
    args.do_CRF = arg.do_CRF

    produce_data()
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    if args.load_weight:
        model = load_model(args.output_dir)
    else:
        model = Bert_CRF.from_pretrained(args.bert_model,
                                         num_tag=len(args.labels))

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #6
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import produce_data
import torch
import os
import json
import config.args as args
from util.model_util import load_model

args.do_inference = True
produce_data()

test_iter = create_batch_iter("inference")
epoch_size = args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
model = load_model(args.output_dir)

num_epoch = args.num_train_epochs
device = torch.device(
    args.device if torch.cuda.is_available() and not args.no_cuda else "cpu")

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]
Example #7
def main():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    # Prepare output directory
    if not args.do_eval:
        args.output_dir = os.path.join(
            args.output_dir,
            list(filter(None,
                        args.model.strip().split("/")))[-1] + "-" +
            datetime.now().strftime("%Y%m%d_%H%M%S"))
        os.mkdir(args.output_dir)
    logger = init_logger("souhu-text-match-2021", args.output_dir)
    logger.info(f"Output dir: {args.output_dir}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)

    logger.info(f"Training arguments: {args}")

    if not args.do_eval:
        train_dataloader, num_train_steps = create_batch_iter(
            args, "train", logger)
    eval_dataloader, _ = create_batch_iter(args, "dev", logger)

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # bert_config = AutoConfig.from_pretrained(args.model, return_dict=True)
    model = BertForSequenceClassification.from_pretrained(args.model)
    model.to(device)

    if args.do_eval:
        # model.eval()
        # result = do_eval(model, eval_dataloader, device, -1, -1)
        # logger.info("***** Eval results *****")
        # for key in sorted(result.keys()):
        #     logger.info("  %s = %s", key, str(result[key]))
        pass
    else:
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        # scheduler = lr_scheduler.StepLR(optimizer, 2)
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   mode='min',
                                                   factor=0.1,
                                                   patience=2)
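        # PGD adversarial training: K inner steps of perturbation on the embeddings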
        pgd = PGD(model)
        K = 3

        # Train and evaluate
        global_step = 0
        best_dev_f1, best_epoch = float("-inf"), float("-inf")
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

        train_loss2plot = []
        train_acc2plot = []
        train_f1_2plot = []
        eval_loss2plot = []
        eval_acc2plot = []
        eval_f1_2plot = []
        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0.
            train_logits = []
            train_labels = []

            model.train()

            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc=f"Epoch {epoch_ + 1} iteration",
                         ascii=True)):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, labels = batch

                outputs = model(input_ids,
                                input_mask,
                                segment_ids,
                                labels=labels,
                                return_dict=True)
                train_logits.append(outputs.logits)
                train_labels.append(labels)

                loss = outputs.loss

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()  # back-propagate to get the normal gradients

                if args.do_adversarial:
                    # adversarial training (PGD)
                    pgd.backup_grad()

                    for t in range(K):
                        pgd.attack(is_first_attack=(
                            t == 0
                        ))  # add an adversarial perturbation on the embeddings; back up param.data on the first attack
                        if t != K - 1:
                            model.zero_grad()
                        else:
                            pgd.restore_grad()
                        adv_outputs = model(input_ids,
                                            input_mask,
                                            segment_ids,
                                            labels=labels,
                                            return_dict=True)
                        adv_loss = adv_outputs.loss
                        if args.n_gpu > 1:
                            adv_loss = adv_loss.mean()
                        adv_loss.backward()  # back-propagate, accumulating the adversarial gradients on top of the normal ones
                    pgd.restore()  # restore the embedding parameters

                # gradient descent: update the parameters
                optimizer.step()
                optimizer.zero_grad()

                tr_loss += loss.item()
                global_step += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    pass
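                    # NOTE: optimizer.step() above already runs every batch, so
                    # gradient accumulation only rescales the loss here.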

                if (global_step + 1) % args.eval_step == 0:
                    logger.info("***** Running evaluation *****")
                    logger.info("  Process = {} iter {} step".format(
                        epoch_, global_step))
                    logger.info("  Batch size = %d", args.eval_batch_size)
                    logger.info(
                        f"next step learning rate = {optimizer.param_groups[0]['lr']:.8f}"
                    )

                    all_train_logits = torch.cat(train_logits, dim=0).cpu()
                    all_train_labels = torch.cat(train_labels, dim=0).cpu()
                    acc, prf = evaluate(all_train_logits, all_train_labels)

                    train_loss2plot.append(loss.item())
                    train_acc2plot.append(acc)
                    train_f1_2plot.append(prf[2])

                    loss = tr_loss / (step + 1)

                    result = do_eval(args, model, eval_dataloader, device,
                                     epoch_, args.num_train_epochs, "eval",
                                     logger)
                    scheduler.step(result["eval_loss"])
                    eval_loss2plot.append(result["eval_loss"])
                    eval_acc2plot.append(result["eval_acc"])
                    eval_f1_2plot.append((result["eval_f1"]))

                    result['global_step'] = global_step
                    result['train_loss'] = loss

                    result_to_file(result, output_eval_file, logger)

                    if args.do_eval:
                        save_model = False
                    else:
                        save_model = False
                        if result['eval_f1'] > best_dev_f1:
                            best_dev_f1 = result['eval_f1']
                            best_epoch = epoch_ + 1
                            save_model = True

                    if save_model:
                        logger.info("***** Save model *****")
                        best_model = model
                        model_to_save = model.module if hasattr(
                            best_model, 'module') else best_model

                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        output_config_file = os.path.join(
                            args.output_dir, "config.json")

                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

        logger.info(
            f"best epoch: {best_epoch}, best eval f1:{best_dev_f1:.4f}")

        loss_acc_plot([
            train_loss2plot, train_acc2plot, train_f1_2plot, eval_loss2plot,
            eval_acc2plot, eval_f1_2plot
        ], os.path.join(args.output_dir, "loss_acc_f1.png"))
        logger.info(f"output dir: {args.output_dir}")
Example #8
def predict():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    logger = init_logger("souhu-text-match-2021", "output/logs/")
    logger.info(f"!!!!!!Test arguments: {args}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)
    test_dataloader = create_batch_iter(args, "test", logger)

    args.output_dir = args.output_dir + sorted(os.listdir(
        args.output_dir))[-1]  # latest training run
    logger.info(f"model {args.output_dir} used for prediction")

    tokenizer = RoFormerTokenizer.from_pretrained(
        "/home/zhuminghao/work/model/pt/chinese_roformer_base")  # the tokenizer was not saved, so reuse the original one
    model = RoFormerForSequenceClassification.from_pretrained(args.output_dir)
    model.to(device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    with torch.no_grad():
        test_logits = []
        ids = []
        for step, batch in enumerate(
                tqdm(test_dataloader, desc="test", ascii=True)):
            sources, targets, bt_ids = batch
            inputs = list(zip(sources, targets))
            ids.extend(bt_ids)  # flatten so ids aligns one-to-one with test_logits

            pt_batch = tokenizer(inputs,
                                 padding=True,
                                 truncation="longest_first",
                                 max_length=args.max_seq_length,
                                 return_tensors="pt")
            pt_batch = pt_batch.to(device)

            outputs = model(**pt_batch, return_dict=True)

            logits = torch.max(outputs.logits, dim=1)[1]
            if device.type == "cuda":
                logits = logits.cpu().numpy().astype(int)
            else:
                logits = logits.numpy()
            test_logits.extend(logits.tolist())

        output_path = args.output_dir + "/test.csv"
        with open(output_path, "w", encoding="utf-8") as fw:
            for _id, label in zip(ids, test_logits):
                fw.write(",".join([str(_id), str(label)]) + "\n")
        logger.info(f"output path: {output_path}")