Example #1
0
def main(args):
    """Pre-train a BERT model on the GPUs listed in ``args.cuda_devices``.

    Loads the preprocessed corpus from ``args.data``, builds the dataset,
    model, optimizer and the word/sentence loss criteria, then makes a single
    pass over the data loader. Per-step losses and accuracies are printed and,
    when the module-level ``tf`` is not ``None``, written as summaries.

    Args:
        args: Parsed options. Attributes read here: ``cuda_devices``
            (comma-separated GPU ids), ``seed``, ``data``, ``batch_size``,
            ``steps``, ``num_cpus``, ``n_stack_layers``, ``d_model``,
            ``d_ff``, ``n_head``, ``dropout``, ``lr`` and ``n_warmup_steps``.

    Raises:
        AssertionError: If no CUDA device is available.
    """
    assert torch.cuda.is_available(), "need to use GPUs"

    cuda_devices = [int(idx) for idx in args.cuda_devices.split(",")]
    is_multigpu = len(cuda_devices) > 1
    device = "cuda"

    # Seed every RNG in play. The model is constructed before it is moved to
    # the GPU, so the default (CPU) torch generator is seeded as well as the
    # CUDA ones.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    # `is_multigpu` is a bool; the previous `is_multigpu > 1` test could never
    # be true, so the per-device seeding below was dead code.
    if is_multigpu:
        torch.cuda.manual_seed_all(args.seed)

    data = torch.load(args.data)
    # The last argument is batch_size * steps -- presumably the number of
    # samples to draw, so that the loader yields `args.steps` batches
    # (TODO confirm against BERTDataSet).
    dataset = BERTDataSet(data['word'], data['max_len'], data["dict"],
                          args.batch_size * args.steps)
    training_data = DataLoader(dataset,
                               batch_size=args.batch_size,
                               num_workers=args.num_cpus)

    model = BERT(dataset.word_size, data["max_len"], args.n_stack_layers,
                 args.d_model, args.d_ff, args.n_head, args.dropout)

    print(
        f"BERT have {sum(x.numel() for x in model.parameters())} paramerters in total"
    )

    # NOTE(review): the Adam optimizer (not an nn.Module) is wrapped in
    # DataParallel before being handed to ScheduledOptim. Left unchanged
    # because ScheduledOptim's expectations are not visible here -- confirm
    # this wrapping is intentional.
    optimizer = ScheduledOptim(
        torch.nn.DataParallel(
            torch.optim.Adam(model.get_trainable_parameters(),
                             lr=args.lr,
                             betas=(0.9, 0.999),
                             eps=1e-09,
                             weight_decay=0.01),
            device_ids=cuda_devices), args.d_model, args.n_warmup_steps)

    w_criterion = WordCrossEntropy().to(device)
    s_criterion = torch.nn.CrossEntropyLoss()

    model = model.to(device)
    model = torch.nn.DataParallel(model, device_ids=cuda_devices)
    model.train()

    for step, datas in enumerate(training_data):
        # Each batch is a 5-tuple of tensors; move all of them to the GPU.
        inp, pos, sent_label, word_label, segment_label = (
            tensor.to(device) for tensor in datas)
        # Flattened to 1-D before being passed to CrossEntropyLoss as targets.
        sent_label = sent_label.view(-1)

        optimizer.zero_grad()
        word, sent = model(inp, pos, segment_label)
        w_loss, w_corrects, tgt_sum = w_criterion(word, word_label)
        s_loss = s_criterion(sent, sent_label)
        if is_multigpu:
            # Reduce to scalars before summing the two losses.
            w_loss, s_loss = w_loss.mean(), s_loss.mean()
        loss = w_loss + s_loss
        loss.backward()
        optimizer.step()

        # Sentence-level accuracy: index of the max score along dim 1 vs. label.
        predicted = torch.max(sent, 1)[1]
        s_corrects = (predicted == sent_label).sum()

        print(
            f"[Step {step+1}/{args.steps}] [word_loss: {w_loss:.5f}, sent_loss: {s_loss:.5f}, loss: {loss:.5f}, w_pre: {w_corrects/tgt_sum*100:.2f}% {w_corrects}/{tgt_sum}, s_pre: {float(s_corrects)/args.batch_size*100:.2f}% {s_corrects}/{args.batch_size}]"
        )

        # Summary logging is optional: `tf` is None when it is unavailable.
        if tf is not None:
            add_summary_value("Word loss", w_loss, step)
            add_summary_value("Sent loss", s_loss, step)
            add_summary_value("Loss", loss, step)
            add_summary_value("Word predict", w_corrects / tgt_sum, step)
            add_summary_value("Sent predict",
                              float(s_corrects) / args.batch_size, step)
            tf_summary_writer.flush()
Example #2
0
def run_training_bert(args, dataset, train_loader, val_loader, vocab_size):
    """Train a BERT model, checkpointing and optionally evaluating every epoch.

    Args:
        args: Run options. Attributes read here: ``checkpoint_path`` and
            ``checkpoint`` (joined into the checkpoint file path), ``device``
            (string appended to ``"cuda:"``), ``lr``, ``epochs`` and ``eval``.
            ``vars(args)`` is stored in the checkpoint.
        dataset: Not used by this function; kept for interface compatibility.
        train_loader: Iterable of batches exposing ``.text`` and ``.label``.
        val_loader: Same batch interface as ``train_loader``, or ``None`` to
            evaluate on the training data instead.
        vocab_size: Stored verbatim in the checkpoint.

    The checkpoint is written to the same path after every epoch, so each
    save overwrites the previous one.
    """
    checkpoint_path = os.path.join(args.checkpoint_path, args.checkpoint)
    device = torch.device(
        "cuda:" + args.device if torch.cuda.is_available() else "cpu")

    model = BERT().to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)

    print("Starting Training Loop...")
    for epoch in range(args.epochs):
        # Re-enter training mode every epoch: the evaluation pass below
        # switches the model to eval mode, and without this reset all later
        # epochs would train in eval mode.
        model.train()

        losses = []
        for batch in train_loader:
            label = batch.label.type(torch.LongTensor).to(device)
            text = batch.text.type(torch.LongTensor).to(device)

            # The model returns (loss, scores) when given labels.
            loss, _ = model(text, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

        epoch_loss = sum(losses) / len(losses)
        print('Epoch: {}, Training Loss: {:.4f}'.format(epoch, epoch_loss))

        # Save after every epoch (the previous `epoch % 1 == 0` guard was
        # always true, so this preserves the existing behaviour).
        torch.save(
            {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'vocab_size': vocab_size,
                'args': vars(args)
            }, checkpoint_path)

        if not args.eval:
            continue

        # Evaluate on the validation set when given, else on the training set.
        eval_loader = val_loader if val_loader is not None else train_loader
        preds = []
        labels = []
        eval_losses = []
        model.eval()
        with torch.no_grad():
            for batch in eval_loader:
                label = batch.label.type(torch.LongTensor).to(device)
                text = batch.text.type(torch.LongTensor).to(device)
                loss, scores = model(text, label)
                preds.extend(torch.argmax(scores, 1).tolist())
                labels.extend(label.tolist())
                eval_losses.append(loss.item())

        # Build the metric inputs once instead of once per metric.
        y_true = np.array(labels).astype('int32')
        y_pred = np.array(preds)
        print("{} Precision: {}, Recall: {}, F1: {}, Loss: {}".format(
            "Train" if val_loader is None else "Valid",
            sklearn.metrics.precision_score(y_true, y_pred),
            sklearn.metrics.recall_score(y_true, y_pred),
            sklearn.metrics.f1_score(y_true, y_pred),
            np.average(eval_losses)))
Example #3
0
        num_training_steps=total_steps)
    #t_total = total_steps)
else:
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # num_warmup_steps = 0,
        warmup_steps=int(0.1 * total_steps),  # Default value in run_glue.py
        # num_training_steps = total_steps)
        t_total=total_steps)
# Bookkeeping for the training run.
# NOTE(review): presumably loss_values collects per-epoch losses and the two
# accuracies track the best evaluation result -- confirm where they are updated.
loss_values = []
best_eval_acc = 0
test_acc = 0

for epoch_i in range(0, args.epochs):
    # Sum of the per-batch losses seen so far in this epoch.
    total_loss = 0
    model.train()

    # For each batch of training data...
    # tqdm wraps enumerate(), which has no len(), so the progress bar cannot
    # show a total.
    for step, batch in tqdm(enumerate(train_dataloader)):
        # Training mode is re-asserted on every step.
        # NOTE(review): presumably because an evaluation pass can switch the
        # model to eval mode mid-epoch -- confirm.
        model.train()

        # The batch is indexed positionally; going by the variable names the
        # layout is (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # NOTE(review): all three tensors are passed positionally, so their
        # meaning depends on the model's forward() parameter order -- verify
        # that the third positional parameter really is the labels.
        outputs = model(b_input_ids, b_input_mask, b_labels)

        # The first element of the model output is used as the loss.
        loss = outputs[0]
        loss.backward()

        # .item() extracts a Python number for the running total.
        total_loss += loss.item()