Example #1
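# Tail of an evaluation helper: for each batch it builds padding masks
# (token id 1 is presumably the <pad> index), runs the model on both
# sentences, and accumulates correct predictions and the summed loss;
# it returns mean accuracy and mean loss. The function header and the
# start of the batch loop fall outside this excerpt.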
        mask1 = (sent1 == 1)
        mask2 = (sent2 == 1)
        y = batch.label
        y = y.to(device)
        y_hat = net(sent1.to(device), sent2.to(device), mask1.to(device),
                    mask2.to(device))
        acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
        n += y.shape[0]
        batch_num += 1
        loss_sum += crition(y_hat, y).cpu().item()
    net.train()
    return acc_sum / n, loss_sum / batch_num


# Training loop
net.train()
print(f'training on: {device}')
best_acc = 0.5
for epoch in range(args.num_epochs):
    start = time.time()
    train_l_sum, train_acc_sum, n, batch_num = 0.0, 0.0, 0, 0
    for batch in train_iter:
        sent1, sent2 = batch.sentence1[0], batch.sentence2[0]
        mask1 = (sent1 == 1)
        mask2 = (sent2 == 1)
        y = batch.label
        y = y.to(device)
        y_hat = net(sent1.to(device), sent2.to(device), mask1.to(device),
                    mask2.to(device))
        l = crition(y_hat, y)
        optimizer.zero_grad()
Example #2

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--embeddings_file",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    parser.add_argument("--train_language",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--train_steps", default=-1, type=int, required=True)
    parser.add_argument("--eval_steps", default=-1, type=int, required=True)
    parser.add_argument(
        "--load_word2vec",
        action='store_true',
        help=
        'if true, load word2vec file for the first time; if false, load generated word-vector csv file'
    )
    parser.add_argument("--generate_word2vec_csv",
                        action='store_true',
                        help='if true, generate word2vec csv file')
    ## normal parameters
    parser.add_argument("--embedding_size", default=300, type=int)
    parser.add_argument("--query_maxlen", default=30, type=int)
    parser.add_argument("--hidden_size", default=300, type=int)
    parser.add_argument("--learning_rate",
                        default=5e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_classes", default=2, type=int)
    parser.add_argument("--dropout", default=0.2, type=float)
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval_train",
                        action='store_true',
                        help="Whether to run eval on the train set.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=10, type=int)
    parser.add_argument("--per_gpu_train_batch_size", default=10, type=int)
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--gradient_accumulation_steps", default=1, type=int)

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # device = torch.device("cpu")
    args.device = device

    # Set seed
    set_seed(args)

    logger.info("Training/evaluation parameters %s", args)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Training
    if args.do_train:
        # build model
        logger.info("*** building model ***")
        embeddings = load_embeddings(args)
        model = ESIM(args.hidden_size,
                     embeddings=embeddings,
                     dropout=args.dropout,
                     num_classes=args.num_classes,
                     device=args.device)
        model.to(args.device)

        if args.n_gpu > 1:
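            # DataParallel replicates the model on every visible GPU and splits
            # each input batch across them along dim 0.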
            model = torch.nn.DataParallel(model)

        args.train_batch_size = args.per_gpu_train_batch_size * max(
            1, args.n_gpu)

        logger.info("*** Loading training data ***")
        train_data = ATEC_Dataset(os.path.join(args.data_dir, 'train.csv'),
                                  os.path.join(args.data_dir, 'vocab.csv'),
                                  args.query_maxlen)
        train_loader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)

        logger.info("*** Loading validation data ***")
        dev_data = ATEC_Dataset(os.path.join(args.data_dir, 'dev.csv'),
                                os.path.join(args.data_dir, 'vocab.csv'),
                                args.query_maxlen)
        dev_loader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.eval_batch_size)

        num_train_optimization_steps = args.train_steps

        # Keep only the parameters that require gradient updates
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
        optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)
        # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
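        # ReduceLROnPlateau with mode="max" lowers the learning rate by a factor
        # of 0.85 whenever the monitored metric stops improving; note that the
        # scheduler.step() call in the training loop below is commented out, so
        # this schedule never actually fires here.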
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode="max",
                                                               factor=0.85,
                                                               patience=0)
        criterion = nn.CrossEntropyLoss()

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
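        # cycle() re-iterates the DataLoader endlessly, so the length of training
        # is governed by --train_steps rather than by epochs.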
        train_loader = cycle(train_loader)

        output_dir = args.output_dir + "eval_results_{}_{}_{}_{}_{}_{}".format(
            'ESIM', str(args.query_maxlen), str(args.learning_rate),
            str(args.train_batch_size), str(args.train_language),
            str(args.train_steps))
        os.makedirs(output_dir, exist_ok=True)
        output_eval_file = os.path.join(output_dir, 'eval_result.txt')
        with open(output_eval_file, "w") as writer:
            writer.write('*' * 80 + '\n')
        for step in bar:
            batch = next(train_loader)
            batch = tuple(t.to(device) for t in batch)
            q1, q1_lens, q2, q2_lens, labels = batch
            # Standard (non-adversarial) training step
            optimizer.zero_grad()
            logits, probs = model(q1, q1_lens, q2, q2_lens)
            loss = criterion(logits, labels)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
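            # Caveat: optimizer.zero_grad() runs at the top of every iteration,
            # so with gradient_accumulation_steps > 1 the gradients from earlier
            # micro-batches are cleared before optimizer.step(); accumulation
            # only takes effect if zero_grad() is moved next to the update.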
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += q1.size(0)
            nb_tr_steps += 1

            loss.backward()
            # Adversarial training (FGM), currently disabled:
            # fgm.attack()  # add an adversarial perturbation to the embeddings
            # loss_adv = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
            # if args.n_gpu > 1:
            #     loss_adv = loss_adv.mean() # mean() to average on multi-gpu.
            # if args.gradient_accumulation_steps > 1:
            #     loss_adv = loss_adv / args.gradient_accumulation_steps
            # loss_adv.backward()  # backward pass; accumulates the adversarial gradients on top of the normal ones
            # fgm.restore()  # restore the original embedding parameters

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                #                 scheduler.step()
                optimizer.step()
                global_step += 1

            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                if args.do_eval_train:
                    file_list = ['train.csv', 'dev.csv']
                else:
                    file_list = ['dev.csv']
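                # Note: the loop below always iterates dev_loader, so when
                # 'train.csv' is in file_list the evaluation data is not
                # actually switched to the training split.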
                for file in file_list:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(dev_data))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for q1, q1_lens, q2, q2_lens, labels in tqdm(dev_loader):
                        with torch.no_grad():
                            logits, probs = model(q1, q1_lens, q2, q2_lens)
                        probs = probs.detach().cpu().numpy()
                        # print(logits.shape, probs.shape)
                        # label_ids = labels.to('cpu').numpy()
                        inference_labels.append(np.argmax(probs, 1))
                        gold_labels.append(labels)
                        # eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += logits.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_labels = np.concatenate(inference_labels, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
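                    # Despite its name, eval_accuracy below holds the F1 score
                    # returned by get_f1.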
                    eval_accuracy = get_f1(inference_labels, gold_labels)

                    result = {
                        # 'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss
                    }

                    if 'dev' in file:
                        with open(output_eval_file, "a") as writer:
                            writer.write(file + '\n')
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best ACC", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
        with open(output_eval_file, "a") as writer:
            writer.write('bert_acc: %f' % best_acc)

    if args.do_test:
        if not args.do_train:
            output_dir = args.output_dir

        # build model
        logger.info("*** building model ***")
        embeddings = load_embeddings(args)
        model = ESIM(args.hidden_size,
                     embeddings=embeddings,
                     dropout=args.dropout,
                     num_classes=args.num_classes,
                     device=args.device)
        model.load_state_dict(
            torch.load(os.path.join(output_dir, 'pytorch_model.bin')))
        model.to(args.device)

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        inference_labels = []
        gold_labels = []

        logger.info("*** Loading testing data ***")
        dev_data = ATEC_Dataset(os.path.join(args.data_dir, 'test.csv'),
                                os.path.join(args.data_dir, 'vocab.csv'),
                                args.query_maxlen)
        dev_loader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.eval_batch_size)

        logger.info(" *** Run Prediction ***")
        logger.info("  Num examples = %d", len(dev_data))
        logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()
        for q1, q1_lens, q2, q2_lens, labels in tqdm(dev_loader):
            with torch.no_grad():
                logits, probs = model(q1, q1_lens, q2, q2_lens)
            probs = probs.detach().cpu().numpy()
            inference_labels.append(np.argmax(probs, 1))
            gold_labels.append(labels)

        gold_labels = np.concatenate(gold_labels, 0)
        preds = np.concatenate(inference_labels, 0)
        test_f1 = get_f1(preds, gold_labels)
        logger.info('predict f1:{}'.format(str(test_f1)))
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument("--epoch", type=int, required=True)
    parser.add_argument("--seed", type=int, required=True)
    parser.add_argument("--emb_file", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    parser.add_argument("--train_file", type=str, required=True)
    parser.add_argument("--log_file", type=str, required=False)
    parser.add_argument("--ratio", type=str, required=True)
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--emb_size", type=int, required=True)
    parser.add_argument("--learning_rate", type=float, required=True)
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--max_length", type=int, required=True)
    parser.add_argument("--max_grad_norm", type=int, required=True)

    args = parser.parse_args()

    split_ratio = [float(val) for val in args.ratio.split(",")]

    has_cuda = torch.cuda.is_available()

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
    logging.basicConfig(filename=args.log_file,
                        level=logging.INFO,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT)

    logging.info("start preparing data")
    data_preprocessor = DataPreprocess()
    emb, word_idx_map = data_preprocessor.build_emb_vocab(args.emb_file)
    data_preprocessor.load(args.train_file, use_mask=False, is_test=False)
    train_dataset, dev_dataset = data_preprocessor.generate_train_dev_dataset(
        ratio=split_ratio)
    train_dataset = CompDataSet(train_dataset,
                                word_idx_map,
                                max_len=args.max_length,
                                emb_size=args.emb_size)
    dev_dataset = CompDataSet(dev_dataset,
                              word_idx_map,
                              max_len=args.max_length,
                              emb_size=args.emb_size)

    train_dataset = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=True)
    dev_dataset = DataLoader(dev_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    logging.info("init model")
    start_epoch = 0
    if args.checkpoint:
        model = torch.load(args.checkpoint)
        # Recover the last completed epoch from a checkpoint name of the form
        # "{epoch}_{step}.pt" and resume from the following epoch.
        start_epoch = re.findall(r"\d+(?=_\d+\.pt)", args.checkpoint)
        start_epoch = int(start_epoch[0]) + 1
    else:
        model = ESIM(args.vocab_size,
                     args.emb_size,
                     emb,
                     max_len=args.max_length)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    criterion = FocalLoss()
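    # AdamW applies decoupled weight decay on top of Adam; FocalLoss (a project
    # helper, presumably the focal loss of Lin et al., 2017) down-weights easy
    # examples, which helps with class imbalance.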

    if has_cuda:
        model = model.cuda()

    logging.info("start training")
    neg_auc, pos_auc = validate(model, dev_dataset)
    logging.info(f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")

    for epoch in range(start_epoch, args.epoch):
        running_loss = 0.0
        for step, data in enumerate(train_dataset):
            model.train()
            start_time = time.time()
            optimizer.zero_grad()

            outputs = model(data["premise"], data["premise_mask"],
                            data["hypothese"], data["hypothese_mask"])
            loss = criterion(outputs["probs"], data["label"])
            loss.backward()
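            # Clip the global gradient norm to --max_grad_norm before the
            # optimizer update to guard against exploding gradients.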

            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()

            end_time = time.time()
            running_loss += loss.item()
            if step % 100 == 99:
                logging.info(
                    f"epoch: {epoch}, step: {step}, time: {end_time - start_time} loss: {running_loss / 100}"
                )
                running_loss = 0
            if step % 500 == 499:
                neg_auc, pos_auc = validate(model, dev_dataset)
                logging.info(
                    f"epoch: {epoch}, step: {step}, neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
                torch.save(model, Path(args.save_dir) / f"{epoch}_{step}.pt")