train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                          shuffle=False)  # num_workers=2 leads to device-side asserts, so keep num_workers=1

try:
    idx_loss_check, loss_recent_tot = 0, 0.
    for epoch in range(epoch_start + 1, epoch_max):  # i.e. epoch_max is the epoch-end value
        time_estimate_last = t_start = time.time()
        model_stepwise.train()

        for idx, (features, labels, deps) in enumerate(train_loader):
            features, labels, deps = features.to(device), labels.to(device), deps.to(device)
            model_opt.zero_grad()

            out_class_logits, out_deps_logits = model_stepwise(features)

            # Per-position loss, then summed over the batch
            # https://pytorch.org/docs/stable/nn.html#torch.nn.CrossEntropyLoss
            class_loss = nn.CrossEntropyLoss(reduction='none')(out_class_logits, labels)
            # class_loss.size() == torch.Size([8, 128])
            class_loss_tot = class_loss.sum()

            # The dep loss should be ignored for those deps which == 0
            dep_loss = nn.CrossEntropyLoss(reduction='none')(out_deps_logits, deps)
            # dep_loss.size() == torch.Size([8, 128])
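            # The snippet stops before applying the masking promised above, so
            # dep_loss still counts positions where deps == 0. A minimal sketch of
            # that masking (assuming label id 0 means "no dependency"; `dep_mask`
            # and `dep_loss_tot` are local names, not from the original code):
            dep_mask = (deps != 0).float()           # 1.0 wherever a real dep label exists
            dep_loss_masked = dep_loss * dep_mask    # zero out the ignored positions
            dep_loss_tot = dep_loss_masked.sum() / dep_mask.sum().clamp(min=1.0)
            # Alternative: nn.CrossEntropyLoss(ignore_index=0) achieves the same
            # effect with built-in mean reduction.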
def run_epoch2(train, test):
    train = LM_Dataset(train, batch_size=16)
    test = LM_Dataset(test, batch_size=16)

    # Optimizer variants tried; only the last uncommented assignment is live:
    # opt = OpenAIAdam(dh_model.parameters(), lr=6.25e-5, schedule='warmup_linear',
    #                  warmup=0.002, t_total=train.n_batches * 3, b1=.9, b2=.999,
    #                  e=1e-8, l2=0.01, vector_l2=True, max_grad_norm=1)
    # opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())
    # opt = torch.optim.SGD(lr=6.25e-5, params=dh_model.parameters())
    opt = Adam16(lr=6.25e-5, params=dh_model.parameters())
    opt = FP16_Optimizer(opt, static_loss_scale=1, dynamic_loss_scale=False)

    criterion = nn.CrossEntropyLoss(reduction='none')  # reduce=False is deprecated
    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for i in tqdm(range(train.n_batches)):
        data = train.next()
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)
        opt.zero_grad()
        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        # print(loss)  # per-batch debug
        avg_loss_train += loss
    print('Training Loss: ', avg_loss_train / train.n_batches)

    for i in tqdm(range(test.n_batches)):
        data = test.next()  # was train.next() -- evaluate on the test split
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)
        # no opt.zero_grad() here: only_return_losses=True does no gradient work
        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)
        avg_loss_test += loss
    print('Test Loss: ', avg_loss_test / test.n_batches)
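# FP16_Optimizer above follows NVIDIA apex's (now legacy) mixed-precision pattern:
# the wrapper scales the loss before backward and unscales gradients before the
# step. A sketch of that pattern in isolation (`compute_loss` is a hypothetical
# placeholder; LangModelLoss presumably drives this internally via opt=opt):
#
#   opt = FP16_Optimizer(Adam16(lr=6.25e-5, params=dh_model.parameters()),
#                        static_loss_scale=1, dynamic_loss_scale=False)
#   loss = compute_loss(batch)   # hypothetical helper
#   opt.backward(loss)           # scaled backward instead of loss.backward()
#   opt.step()
#   opt.zero_grad()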
    else:
        ce_loss_x, ce_loss_s, scl_loss = model(batch)
        ce_loss = (ce_loss_x + ce_loss_s) / 2
        if not args.with_summary:
            ce_loss = ce_loss_x
        # Interpolate between the cross-entropy and supervised-contrastive terms
        loss = args.lambd * ce_loss + (1 - args.lambd) * scl_loss
        # print(ce_loss_x, ce_loss_s, scl_loss)
        loss.backward()
        count += 1
        if count % args.num_accum == 0:
            optimizer.step()
            recoder.log_train(ce_loss_x, ce_loss_s, scl_loss, loss)
            step += 1
            optimizer.zero_grad()
            if step >= args.steps:
                break
            if step % args.log_step == 0:
                begin_eval = True
        if step % 10 == 0:
            bar.update(10)
        # step += 1
        if begin_eval:
            recoder.meter(step)
            evaluate_model(model, test_loader, recoder, step)
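# The block above steps the optimizer only every args.num_accum batches
# (gradient accumulation). The same pattern in isolation, runnable on toy data
# (the toy model and num_accum=4 are arbitrary choices, not from the code above):
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
num_accum = 4

optimizer.zero_grad()
for count, (x, y) in enumerate(
        [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(8)], start=1):
    # Dividing by num_accum keeps the accumulated gradient comparable to one big
    # batch; note the training loop above does not apply this scaling.
    loss = nn.functional.cross_entropy(model(x), y) / num_accum
    loss.backward()                # gradients sum across micro-batches
    if count % num_accum == 0:
        optimizer.step()           # one parameter update per num_accum batches
        optimizer.zero_grad()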
def run_epoch(train_loader, test_loader):
    # OpenAIAdam variant kept for reference; it was immediately overwritten, so
    # only the assignments below are live:
    # opt = OpenAIAdam(dh_model.parameters(), lr=6.25e-5, schedule='warmup_linear',
    #                  warmup=0.002, t_total=len(train_loader) * 3, b1=.9, b2=.999,
    #                  e=1e-8, l2=0.01, vector_l2=True, max_grad_norm=1)
    opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())
    # print(half)  # debug
    if half:
        opt = Adam16(lr=6.25e-5, params=dh_model.parameters())

    criterion = nn.CrossEntropyLoss(reduction='none')  # reduce=False is deprecated
    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for (data, mask), target in tqdm(train_loader):
        opt.zero_grad()
        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()
        if half:
            mask = mask.half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        # print(loss)  # per-batch debug
        avg_loss_train += loss
    print('Training Loss: ', avg_loss_train / len(train_loader))

    for (data, mask), target in tqdm(test_loader):
        # no opt.zero_grad() here: only_return_losses=True does no gradient work
        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()
        if half:
            mask = mask.half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)
        avg_loss_test += loss
    print('Test Loss: ', avg_loss_test / len(test_loader))
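# Why swap in Adam16 when half=True: plain Adam updates underflow in fp16 once
# lr * grad drops below fp16 resolution. The usual fix, which an "Adam16"-style
# optimizer implements, is to keep fp32 master copies of the fp16 parameters and
# step in fp32. A simplified sketch of that idea (SGD-style step, Adam moments
# omitted; this is an illustration, not the repo's Adam16):
import torch

def master_weight_step(params_fp16, masters_fp32, lr=6.25e-5):
    """One update via fp32 master weights, copied back down to fp16."""
    for p16, p32 in zip(params_fp16, masters_fp32):
        if p16.grad is None:
            continue
        p32.data -= lr * p16.grad.data.float()  # accumulate the update in fp32
        p16.data.copy_(p32.data.half())         # refresh the fp16 working copy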