Example #1
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=False)  # num_workers=2 led to device-side asserts, so workers are left at the default

    try:
      idx_loss_check, loss_recent_tot = 0, 0.
      for epoch in range(epoch_start+1, epoch_max):  # epoch_max is the exclusive end value
        time_estimate_last = t_start = time.time()

        model_stepwise.train()
      
        for idx, (features, labels, deps) in enumerate(train_loader):
          features, labels, deps = features.to(device), labels.to(device), deps.to(device)
          
          model_opt.zero_grad()
          out_class_logits, out_deps_logits = model_stepwise(features)
          
          # Per-position cross-entropy, left unreduced so positions can be masked later
          # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
          class_loss = nn.CrossEntropyLoss(reduction='none')(out_class_logits, labels)
          # class_loss.size() == torch.Size([8, 128]), i.e. (batch, seq_len)
          class_loss_tot = class_loss.sum()
          
          # The dep loss should be ignored at positions where deps == 0
          dep_loss = nn.CrossEntropyLoss(reduction='none')(out_deps_logits, deps)
          # dep_loss.size() == torch.Size([8, 128]), i.e. (batch, seq_len)
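The fragment ends before the masking promised in the comment is applied. A minimal sketch of that step, assuming deps == 0 marks padding positions and model_opt is the optimizer created earlier in the script:

    # Hypothetical continuation: zero out the dep loss at padding positions,
    # then combine with the class loss and take the optimizer step.
    dep_mask = (deps != 0).float()              # 1.0 at real tokens, 0.0 where deps == 0
    dep_loss_tot = (dep_loss * dep_mask).sum()

    batch_loss = class_loss_tot + dep_loss_tot
    batch_loss.backward()
    model_opt.step()
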
Example #2
def run_epoch2(train, test):

    train = LM_Dataset(train, batch_size=16)
    test = LM_Dataset(test, batch_size=16)

    opt = OpenAIAdam(dh_model.parameters(),
                     lr=6.25e-5,
                     schedule='warmup_linear',
                     warmup=0.002,
                     t_total=train.n_batches * 3,
                     b1=.9,
                     b2=.999,
                     e=1e-8,
                     l2=0.01,
                     vector_l2=True,
                     max_grad_norm=1)
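    # warmup_linear: the LR climbs linearly from 0 to lr over the first
    # warmup * t_total updates (0.2% of three passes over the data here),
    # then decays linearly back to 0 by step t_total.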

    # NB: this replaces the OpenAIAdam instance above; the commented-out
    # lines are alternative optimizers that were tried.
    #opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())
    opt = Adam16(lr=6.25e-5, params=dh_model.parameters())
    #opt = torch.optim.SGD(lr=6.25e-5, params=dh_model.parameters())

    opt = FP16_Optimizer(opt, static_loss_scale=1, dynamic_loss_scale=False)

    criterion = nn.CrossEntropyLoss(reduction='none')  # reduce=False is deprecated

    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for i in tqdm(range(train.n_batches)):

        data = train.next()
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)

        opt.zero_grad()

        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()

        lm_logits, clf_logits = dh_model(data)

        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        print(loss)
        avg_loss_train += loss

    print('Training Loss: ', avg_loss_train / train.n_batches)

    for i in tqdm(range(test.n_batches)):

        data = test.next()  # was train.next(), which re-read the training set
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)

        opt.zero_grad()

        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()

        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)

        avg_loss_test += loss

    print('Test Loss: ', avg_loss_test / test.n_batches)
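
With static_loss_scale=1 and dynamic_loss_scale=False, the FP16_Optimizer wrapper above does no loss scaling at all. A minimal sketch of the more usual dynamic-scaling pattern, reusing the snippet's names (Adam16, dh_model, L):

    # Hypothetical variant: dynamic loss scaling guards against FP16
    # gradient underflow by scaling the loss up before backward and
    # unscaling the gradients before the update.
    opt = FP16_Optimizer(Adam16(lr=6.25e-5, params=dh_model.parameters()),
                         dynamic_loss_scale=True)

    opt.zero_grad()
    loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)
    opt.backward(loss)   # FP16_Optimizer's backward applies the loss scale
    opt.step()           # skips the step and shrinks the scale on overflow
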
Example #3
        else:
            ce_loss_x, ce_loss_s, scl_loss = model(batch)
        # Average the two cross-entropies, unless summaries are disabled
        ce_loss = (ce_loss_x + ce_loss_s) / 2
        if not args.with_summary:
            ce_loss = ce_loss_x
        # Interpolate between cross-entropy and the contrastive (scl) loss
        loss = args.lambd * ce_loss + (1 - args.lambd) * scl_loss

        # NB: the loss is not divided by args.num_accum, so gradients are
        # summed (not averaged) over the accumulation window.
        loss.backward()

        count += 1
        if (count % args.num_accum == 0):
            optimizer.step()
            recoder.log_train(ce_loss_x, ce_loss_s, scl_loss, loss)
            step += 1
            optimizer.zero_grad()

            if (step >= args.steps):
                break

            if (step % args.log_step == 0):
                begin_eval = True

            if (step % 10 == 0):
                bar.update(10)

        if begin_eval:
            recoder.meter(step)
            evaluate_model(model, test_loader, recoder, step)
            begin_eval = False  # reset, otherwise evaluation runs on every subsequent batch
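A minimal, self-contained sketch of the accumulation pattern above, assuming a hypothetical compute_loss helper; dividing by num_accum makes the update match one large batch of num_accum micro-batches:

    # Hypothetical distilled version of the loop above.
    for count, batch in enumerate(train_loader, start=1):
        loss = compute_loss(batch)            # stand-in for the model/loss calls
        (loss / args.num_accum).backward()    # gradients accumulate in .grad
        if count % args.num_accum == 0:
            optimizer.step()
            optimizer.zero_grad()
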
Example #4
def run_epoch(train_loader, test_loader):

    opt = OpenAIAdam(dh_model.parameters(),
                     lr=6.25e-5,
                     schedule='warmup_linear',
                     warmup=0.002,
                     t_total=len(train_loader) * 3,
                     b1=.9,
                     b2=.999,
                     e=1e-8,
                     l2=0.01,
                     vector_l2=True,
                     max_grad_norm=1)

    opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())  # NB: replaces the OpenAIAdam above

    print(half)  # debug: report whether FP16 mode is active

    if half:

        opt = Adam16(lr=6.25e-5, params=dh_model.parameters())

    criterion = nn.CrossEntropyLoss(reduction='none')  # reduce=False is deprecated

    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for (data, mask), target in tqdm(train_loader):
        opt.zero_grad()

        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()  #.half()

        if half:
            mask = mask.half()

        lm_logits, clf_logits = dh_model(data)

        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        print(loss)
        avg_loss_train += loss

    print('Training Loss: ', avg_loss_train / len(train_loader))

    for (data, mask), target in tqdm(test_loader):

        opt.zero_grad()

        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()  #.half()

        if half:
            mask = mask.half()

        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)

        avg_loss_test += loss

    print('Test Loss: ', avg_loss_test / len(test_loader))
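
Neither test loop in these examples disables gradient tracking or switches the model out of training mode. A minimal sketch of the usual inference guard, reusing the names from Example #4 (the optimizer calls are dropped, since only_return_losses=True never backpropagates):

    # Hypothetical tightened test loop.
    dh_model.eval()
    with torch.no_grad():
        for (data, mask), target in tqdm(test_loader):
            if GPU:
                data, mask = data.cuda(), mask.cuda()
            if half:
                mask = mask.half()
            lm_logits, clf_logits = dh_model(data)
            loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)
            avg_loss_test += loss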