Example #1
if not evaluation:
    writer = SummaryWriter(logdir=log_path + task_name + ".rank_" +
                           str(torch.distributed.get_rank()))
    f_log = open(log_path + task_name + ".rank_" +
                 str(torch.distributed.get_rank()) + ".log", 'w')

trainer = Trainer(criterion, optimizer, n_class, size_g, size_p,
                  sub_batch_size, mode, lamb_fmreg)
evaluator = Evaluator(n_class, size_g, size_p, sub_batch_size, mode, test)

best_pred = 0.0
if torch.distributed.get_rank() == 0:
    print("start training......")

for epoch in range(num_epochs):
    trainer.set_train(model_ddp)
    optimizer.zero_grad()

    # Huaxin: maybe should call scheduler outside of training epoch
    # scheduler(optimizer, epoch, epoch, best_pred)

    tbar = tqdm(dataloader_train)
    train_loss = 0
    for i_batch, sample_batched in enumerate(tbar):
        images, labels = sample_batched['image'], sample_batched['label']  # PIL images
        # print(images[0].size, labels[0].size)

        if evaluation:
            break
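
Note that this snippet opens one TensorBoard writer and one log file per process, suffixing each path with torch.distributed.get_rank() so concurrent DDP workers never write to the same file, while console output is guarded to rank 0. Below is a minimal self-contained sketch of that per-rank logging pattern; the single-process gloo init and the ./runs/demo paths are illustrative assumptions, not part of the original code.

import os
import torch.distributed as dist
from tensorboardX import SummaryWriter

# Assumption: single-process init so the sketch runs standalone;
# a real DDP launcher (e.g. torchrun) provides these settings.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

rank = dist.get_rank()
log_path, task_name = "./runs/", "demo"  # hypothetical paths
os.makedirs(log_path, exist_ok=True)

# One writer and one log file per rank, so workers never collide on a file.
writer = SummaryWriter(logdir=log_path + task_name + ".rank_" + str(rank))
f_log = open(log_path + task_name + ".rank_" + str(rank) + ".log", 'w')

if rank == 0:  # console output only from rank 0
    print("start training......")
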
Example #2
criterion = lambda x, y: criterion1(x, y)
# criterion = lambda x, y: 0.5 * criterion1(x, y) + 0.5 * criterion3(x, y)
mse = nn.MSELoss()

if not evaluation:
    writer = SummaryWriter(log_dir=os.path.join(log_path, task_name))
    f_log = open(os.path.join(log_path, task_name + ".log"), 'w')

trainer = Trainer(criterion, optimizer, n_class, size_g, size_p, sub_batch_size, mode, lamb_fmreg)
evaluator = Evaluator(n_class, size_g, size_p, sub_batch_size, mode, test)

best_pred = 0.0
print("start training......")
for epoch in range(num_epochs):
    trainer.set_train(model)
    optimizer.zero_grad()
    tbar = tqdm(dataloader_train)
    train_loss = 0
    for i_batch, sample_batched in enumerate(tbar):
        if evaluation:
            break
        scheduler(optimizer, i_batch, epoch, best_pred)
        loss = trainer.train(sample_batched, model, global_fixed)
        train_loss += loss.item()
        score_train, score_train_global, score_train_local = trainer.get_scores()
        if mode == 1:
            tbar.set_description('Train loss: %.3f; global mIoU: %.3f' %
                                 (train_loss / (i_batch + 1),
                                  np.mean(np.nan_to_num(score_train_global["iou"]))))
        else:
            tbar.set_description('Train loss: %.3f; agg mIoU: %.3f' %
                                 (train_loss / (i_batch + 1),
                                  np.mean(np.nan_to_num(score_train["iou"]))))

    score_train, score_train_global, score_train_local = trainer.get_scores()
    trainer.reset_metrics()
    # torch.cuda.empty_cache()
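
The progress-bar mIoU above is computed as np.mean(np.nan_to_num(score["iou"])): per-class IoU values that come out as NaN (classes absent from both prediction and ground truth in the batch) are mapped to 0 before averaging, so they lower the mean instead of propagating NaN. A minimal sketch of that reduction follows; the example IoU vector is made up for illustration.

import numpy as np

# Hypothetical per-class IoU; NaN marks a class that never occurred.
iou = np.array([0.82, np.nan, 0.67, 0.74])

miou = np.mean(np.nan_to_num(iou))  # NaN -> 0.0, then average
print('mIoU: %.3f' % miou)          # about 0.56
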
Example #3

criterion = lambda x, y: criterion1(x, y)
# criterion = lambda x, y: 0.5 * criterion1(x, y) + 0.5 * criterion3(x, y)
mse = nn.MSELoss()

if not evaluation:
    writer = SummaryWriter(logdir=log_path + task_name)
    f_log = open(log_path + task_name + ".log", 'w')
 
trainer = Trainer(criterion, optimizer, n_class, size_g, size_p, sub_batch_size, mode, lamb_fmreg)
evaluator = Evaluator(n_class, size_g, size_p, sub_batch_size, mode, test)

best_pred = 0.0
print("start training......")
for epoch in range(num_epochs):
    # Huaxin: model_ddp.train is not called.
    trainer.set_train(model_ddp, parallel=True)
    optimizer.zero_grad()

    # Huaxin: maybe should call scheduler outside of training epoch
    # scheduler(optimizer, epoch, epoch, best_pred)

    tbar = tqdm(dataloader_train)
    train_loss = 0
    for i_batch, sample_batched in enumerate(tbar):
        images, labels = sample_batched['image'], sample_batched['label']  # PIL images
        # print(images[0].size, labels[0].size)

        if evaluation:
            break

        scheduler(optimizer, i_batch, epoch, best_pred)
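
All three examples step the learning rate per batch through scheduler(optimizer, i_batch, epoch, best_pred), but the scheduler itself is not shown. The sketch below is a hypothetical callable poly-decay scheduler with a compatible signature; the class name PolyLR, the decay power, and the base_lr value are assumptions, not the repository's actual implementation.

import torch

class PolyLR:
    # Hypothetical callable scheduler matching the call
    # scheduler(optimizer, i_batch, epoch, best_pred) used above.
    def __init__(self, base_lr, num_epochs, iters_per_epoch, power=0.9):
        self.base_lr = base_lr
        self.iters_per_epoch = iters_per_epoch
        self.total_iters = num_epochs * iters_per_epoch
        self.power = power

    def __call__(self, optimizer, i_batch, epoch, best_pred):
        # best_pred is accepted for signature compatibility but unused here.
        it = epoch * self.iters_per_epoch + i_batch
        lr = self.base_lr * (1 - it / self.total_iters) ** self.power
        for group in optimizer.param_groups:
            group['lr'] = lr

# Usage sketch with a dummy parameter:
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=5e-5)
scheduler = PolyLR(base_lr=5e-5, num_epochs=50, iters_per_epoch=100)
scheduler(optimizer, i_batch=0, epoch=0, best_pred=0.0)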