def validation(model, valid_loader, criterion):
    """Run one validation pass and print per-term and average losses.

    Parameters
    ----------
    model : torch.nn.Module
        Network under evaluation.
    valid_loader : torch.utils.data.DataLoader
        Yields (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map,
        cos_map, meta) tuples.
    criterion : callable
        Returns the (tr, tcl, sin, cos, radii) loss terms.
    """
    model.eval()
    losses = AverageMeter()
    # Fix: run inference under no_grad — the original built autograd graphs
    # during validation (wasted memory/time); the sibling validation routine
    # in this file already does this.
    with torch.no_grad():
        for i, (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map,
                cos_map, meta) in enumerate(valid_loader):
            print(meta['image_id'])
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map = to_device(
                img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)
            output = model(img)
            tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss = \
                criterion(output, tr_mask, tcl_mask, sin_map, cos_map,
                          radius_map, train_mask)
            loss = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss
            losses.update(loss.item())

            if cfg.viz and i < cfg.vis_num:
                visualize_network_output(output, tr_mask, tcl_mask,
                                         prefix='val_{}'.format(i))

            if i % cfg.display_freq == 0:
                print(
                    'Validation: - Loss: {:.4f} - tr_loss: {:.4f} - tcl_loss: {:.4f} - sin_loss: {:.4f} - cos_loss: {:.4f} - radii_loss: {:.4f}'
                    .format(loss.item(), tr_loss.item(), tcl_loss.item(),
                            sin_loss.item(), cos_loss.item(),
                            radii_loss.item()))

    print('Validation Loss: {}'.format(losses.avg))
def train(model, train_loader, train_data, test_data, val_data, scheduler,
          optimizer, epoch):
    """Train for one epoch; every cfg.save_freq epochs, score all splits.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier; column 0 of each data tensor holds the label, the rest
        are features.
    train_loader : torch.utils.data.DataLoader
        Batches of combined [label | features] rows.
    train_data, test_data, val_data : torch.Tensor
        Full split tensors used for the periodic accuracy evaluation.
    scheduler : LR scheduler (only queried for logging here).
    optimizer : torch.optim.Optimizer
    epoch : int
        Current epoch index.
    """
    global train_step
    global accuracy_tests
    global accuracy_trains
    global accuracy_vals
    losses = AverageMeter(max=100)
    model.train()
    # scheduler.step()
    print('Epoch: {} : LR = {}'.format(epoch, scheduler.get_lr()))

    for i, data in enumerate(train_loader):
        train_step += 1
        data = to_device(data)
        # Skip a ragged final batch so the fixed batch size assumption holds.
        if data.shape[0] != cfg.batch_size:
            continue
        output = model(data[:, 1:])  # column 0 is the label
        target = data[:, 0].long()
        loss = F.nll_loss(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.update(loss.item())
        gc.collect()

        if i % cfg.display_freq == 0:
            print("({:d} / {:d}), loss: {:.3f}".format(i, len(train_loader),
                                                       loss.item()))

    if epoch % cfg.save_freq == 0:
        # Fix: the original evaluated the full splits in *training* mode with
        # autograd enabled — dropout/batchnorm behaved as in training and a
        # graph over each entire split was built for nothing.
        model.eval()
        with torch.no_grad():
            def _split_accuracy(split):
                # Accuracy (%) of the model over one full split tensor.
                labels = split[:, 0].long()
                pred = model(split[:, 1:]).data.max(1, keepdim=True)[1]
                correct = pred.eq(labels.data.view_as(pred)).cpu().sum()
                return correct * 100.0 / labels.shape[0]

            accuracy_test = _split_accuracy(test_data)
            accuracy_tests.append(round(accuracy_test.item(), 3))

            accuracy_train = _split_accuracy(train_data)
            accuracy_trains.append(round(accuracy_train.item(), 3))

            accuracy_val = _split_accuracy(val_data)
            accuracy_vals.append(round(accuracy_val.item(), 3))
        model.train()

        print("accuracy_train: {}; accuracy_val: {}; accuracy_test: {}"
              .format(accuracy_train, accuracy_val, accuracy_test))

    # if epoch % cfg.save_freq == 0:
    #     save_model(model, epoch, scheduler.get_lr(), optimizer)
    print('Training Loss: {}'.format(losses.avg))
def validation(model, valid_loader, criterion):
    """Evaluate the model on the validation loader and print loss averages.

    Runs under no_grad in eval mode; tracks the total loss and each of its
    three components (regression, center dice, region dice).
    """
    with torch.no_grad():
        model.eval()
        total_meter = AverageMeter()
        reg_meter = AverageMeter()
        center_meter = AverageMeter()
        region_meter = AverageMeter()

        for step, (img, reg_mask, meta) in enumerate(valid_loader):
            img, reg_mask = to_device(img, reg_mask)
            prediction = model(img)
            loss_reg, loss_dice_center, loss_dice_region = criterion(
                prediction, reg_mask)
            total = loss_reg + loss_dice_center + loss_dice_region

            # Fold each component into its running-average meter.
            for meter, value in ((total_meter, total),
                                 (reg_meter, loss_reg),
                                 (center_meter, loss_dice_center),
                                 (region_meter, loss_dice_region)):
                meter.update(value.item())

            if cfg.visualization and step % cfg.visualization_frequency == 0:
                visualize_network_output(img, prediction, reg_mask, mode='val')

            print(
                'Validation: - Loss: {:.4f} - Reg_Loss: {:.4f} - Center_Dice_Loss: {:.4f} - Region_Dice_Loss: {:.4f}'
                .format(total.item(), loss_reg.item(),
                        loss_dice_center.item(), loss_dice_region.item()))

        print('Validation Loss: {}'.format(total_meter.avg))
        print('Regression Loss: {}'.format(reg_meter.avg))
        print('Center Dice Loss: {}'.format(center_meter.avg))
        print('Region Dice Loss: {}'.format(region_meter.avg))
def train(model, train_loader, criterion, scheduler, optimizer, epoch):
    """Train for one epoch, tracking total / regression / dice losses.

    Parameters
    ----------
    model : torch.nn.Module
    train_loader : DataLoader yielding (img, reg_mask, meta); may yield a
        None image on a failed sample.
    criterion : callable returning (reg, center-dice, region-dice) losses.
    scheduler : per-iteration LR scheduler.
    optimizer : torch.optim.Optimizer
    epoch : int
        Current epoch (drives the checkpoint cadence).
    """
    losses = AverageMeter()
    reg_losses = AverageMeter()
    center_loss = AverageMeter()
    region_loss = AverageMeter()
    model.train()
    print('Epoch: {} : LR = {}'.format(epoch, optimizer.param_groups[0]['lr']))

    for i, (img, reg_mask, meta) in enumerate(train_loader):
        # Loader signals a failed sample with None; skip that batch.
        if img is None:
            print("Exception loading data! Preparing loading next batch data!")
            continue
        img, reg_mask = to_device(img, reg_mask)
        output = model(img)
        loss_reg, loss_dice_center, loss_dice_region = criterion(
            output, reg_mask)
        loss = loss_reg + loss_dice_center + loss_dice_region

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Fix: step the LR scheduler *after* optimizer.step(); the original
        # stepped it at the top of the loop, which PyTorch >= 1.1 warns
        # about and which skips the first learning-rate value.
        scheduler.step()

        losses.update(loss.item())
        reg_losses.update(loss_reg.item())
        center_loss.update(loss_dice_center.item())
        region_loss.update(loss_dice_region.item())

        if cfg.visualization and i % cfg.visualization_frequency == 0:
            visualize_network_output(img, output, reg_mask, mode='train')

        print(
            '[{:d} | {:d}] - Loss: {:.4f} - Reg_Loss: {:.4f} - Center_Dice_Loss: {:.4f} - Region_Dice_Loss: {:.4f} - LR: {:e}'
            .format(i, len(train_loader), loss.item(), loss_reg.item(),
                    loss_dice_center.item(), loss_dice_region.item(),
                    optimizer.param_groups[0]['lr']))

    if epoch % cfg.save_frequency == 0:
        save_model(model, epoch, scheduler.get_lr(), optimizer)
def train(model, train_loader, criterion, scheduler, optimizer, epoch):
    """Run one TextSnake-style training epoch.

    Parameters
    ----------
    model : torch.nn.Module
    train_loader : DataLoader yielding (img, train_mask, tr_mask, tcl_mask,
        radius_map, sin_map, cos_map, meta).
    criterion : loss returning (tr, tcl, sin, cos, radii) terms.
    scheduler : per-iteration LR scheduler.
    optimizer : torch.optim.Optimizer
    epoch : int
        Current epoch (drives logging and checkpointing).
    """
    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    model.train()

    for i, (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map,
            meta) in enumerate(train_loader):
        data_time.update(time.time() - end)

        img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map = to_device(
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)

        output = model(img)
        tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss = \
            criterion(output, tr_mask, tcl_mask, sin_map, cos_map, radius_map,
                      train_mask)
        loss = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Fix: step the scheduler *after* optimizer.step(); the original
        # stepped it first, which PyTorch >= 1.1 warns about and which skips
        # the initial learning-rate value.
        scheduler.step()

        losses.update(loss.item())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if cfg.viz and i < cfg.vis_num:
            visualize_network_output(output, tr_mask, tcl_mask,
                                     prefix='train_{}'.format(i))

        if i % cfg.display_freq == 0:
            print(
                'Epoch: [ {} ][ {:03d} / {:03d} ] - Loss: {:.4f} - tr_loss: {:.4f} - tcl_loss: {:.4f} - sin_loss: {:.4f} - cos_loss: {:.4f} - radii_loss: {:.4f}'
                .format(epoch, i, len(train_loader), loss.item(),
                        tr_loss.item(), tcl_loss.item(), sin_loss.item(),
                        cos_loss.item(), radii_loss.item()))

    if epoch % cfg.save_freq == 0 and epoch > 0:
        save_model(model, epoch, scheduler.get_lr())

    print('Training Loss: {}'.format(losses.avg))
def train(train_loader, model, criterion, optimizer, writer, epoch,
          no_cuda=False, log_interval=25, **kwargs):
    """
    Training routine

    Parameters
    ----------
    :param train_loader : torch.utils.data.DataLoader
        The dataloader of the train set. Batches may be 2-tuples
        (data, target) or 3-tuples (data, lengths, target) for sequences.
    :param model : torch.nn.module
        The network model being used.
    :param criterion : torch.nn.loss
        The loss function used to compute the loss of the model.
    :param optimizer : torch.optim
        The optimizer used to perform the weight update.
    :param writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    :param epoch : int
        Number of the epoch (for logging purposes).
    :param no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    :param log_interval : int
        Interval limiting the logging of mini-batches. Default value of 25.
    :return: None
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    data_time = AverageMeter()

    # Switch to train mode (turn on dropout & stuff)
    model.train()

    # Iterate over whole training set
    end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader),
                unit='batch', ncols=150, leave=False)
    for batch_idx, data in pbar:
        # A 3-tuple batch carries (data, lengths, target) for sequence input.
        is_sequence = len(data) == 3
        if is_sequence:
            in_data, length, target = sort_sequences_desc_order(data)
        else:
            in_data, target = data

        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU.
        # Fix: `async` is a reserved keyword since Python 3.7 (SyntaxError);
        # PyTorch renamed the argument to `non_blocking` in 0.4.
        if not no_cuda:
            in_data = in_data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            length = length.cuda(non_blocking=True) if is_sequence else None

        input_var = in_data
        target_var = target

        # Compute output
        if is_sequence:
            model.zero_grad()
            output = model((input_var, length))
            # output = output.view(output.size(0), 2)
            # target_var = target_var.view(output.size(0))
        else:
            output = model(input_var)

        # Compute and record the loss
        loss = criterion(output, target_var)
        losses.update(loss.item(), input_var.size(0))

        # Compute and record the accuracy
        acc1 = accuracy(output.data, target_var, topk=(1, ))[0]
        top1.update(acc1[0], input_var.size(0))

        # Add loss and accuracy to Tensorboard.
        # Fix: identity comparison with None uses `is`, not `==`.
        if multi_run is None:
            writer.add_scalar('train/mb_loss', loss.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_accuracy', acc1.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)
        else:
            writer.add_scalar('train/mb_loss_{}'.format(multi_run),
                              loss.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_accuracy_{}'.format(multi_run),
                              acc1.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)

        # Reset gradient
        optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Perform a step by updating the weights
        optimizer.step()

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Log to console
        if batch_idx % log_interval == 0:
            pbar.set_description('train epoch [{0}][{1}/{2}]\t'.format(
                epoch, batch_idx, len(train_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=losses),
                Acc1='{top1.avg:.3f}\t'.format(top1=top1),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))
def train(train_loader, model, criterion, optimizer, writer, epoch,
          no_cuda=False, log_interval=25, **kwargs):
    """
    Training routine

    Parameters
    ----------
    train_loader : torch.utils.data.DataLoader
        The dataloader of the train set.
    model : torch.nn.module
        The network model being used.
    criterion : torch.nn.loss
        The loss function used to compute the loss of the model.
    optimizer : torch.optim
        The optimizer used to perform the weight update.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 25.

    Returns
    ----------
    acc_meter.avg : float
        Accuracy of the model on the training split for this epoch.
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    data_time = AverageMeter()

    # Switch to train mode (turn on dropout & stuff)
    model.train()

    # Iterate over whole training set
    end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader),
                unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU.
        # Fix: `async` is a reserved keyword since Python 3.7 (SyntaxError);
        # PyTorch renamed the argument to `non_blocking` in 0.4.
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        # Convert the input and its labels to Torch Variables
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        acc, loss = train_one_mini_batch(model, criterion, optimizer,
                                         input_var, target_var, loss_meter,
                                         acc_meter)

        # Add loss and accuracy to Tensorboard
        if multi_run is None:
            writer.add_scalar('train/mb_loss', loss.data.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_accuracy', acc.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)
        else:
            writer.add_scalar('train/mb_loss_{}'.format(multi_run),
                              loss.data.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_accuracy_{}'.format(multi_run),
                              acc.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Log to console
        if batch_idx % log_interval == 0:
            pbar.set_description('train epoch [{0}][{1}/{2}]\t'.format(
                epoch, batch_idx, len(train_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=loss_meter),
                Acc1='{acc_meter.avg:.3f}\t'.format(acc_meter=acc_meter),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Logging the epoch-wise accuracy
    if multi_run is None:
        writer.add_scalar('train/accuracy', acc_meter.avg, epoch)
    else:
        writer.add_scalar('train/accuracy_{}'.format(multi_run),
                          acc_meter.avg, epoch)

    logging.debug(
        'Train epoch[{}]: '
        'Acc@1={acc_meter.avg:.3f}\t'
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time,
               loss=loss_meter, acc_meter=acc_meter))

    return acc_meter.avg
def _evaluate(data_loader, model, criterion, writer, epoch, logging_label,
              no_cuda=False, log_interval=10, **kwargs):
    """
    The evaluation routine

    Parameters
    ----------
    :param data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set. Batches may be 2-tuples
        (data, target) or 3-tuples (data, lengths, target) for sequences.
    :param model : torch.nn.module
        The network model being used
    :param criterion: torch.nn.loss
        The loss function used to compute the loss of the model
    :param writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    :param epoch : int
        Number of the epoch (for logging purposes)
    :param logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. Its
        prepended to the logging output path and messages.
    :param no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    :param log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.
    :return: None
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such )
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()

    # Empty lists to store the predictions and target values
    preds = []
    targets = []

    pbar = tqdm(enumerate(data_loader), total=len(data_loader), unit='batch',
                ncols=150, leave=False)
    # Fix: evaluate without building autograd graphs — the original's
    # `volatile=True` intent survived only in comments after the 0.4 port.
    with torch.no_grad():
        for batch_idx, data in pbar:
            # A 3-tuple batch carries (data, lengths, target) for sequences.
            is_sequence = len(data) == 3
            if is_sequence:
                in_data, length, target = sort_sequences_desc_order(data)
            else:
                in_data, target = data

            # Measure data loading time
            data_time.update(time.time() - end)

            # Moving data to GPU.
            # Fix: `async` is a reserved keyword since Python 3.7
            # (SyntaxError); renamed to `non_blocking` in PyTorch 0.4.
            if not no_cuda:
                in_data = in_data.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)
                length = length.cuda(non_blocking=True) if is_sequence else None

            input_var = in_data
            target_var = target

            # Compute output
            if is_sequence:
                model.zero_grad()
                output = model((input_var, length))
                output = output.view(output.size(0), 2)
                target_var = target_var.view(output.size(0))
            else:
                output = model(input_var)

            # Compute and record the loss
            loss = criterion(output, target_var)
            losses.update(loss.item(), input_var.size(0))

            # Compute and record the accuracy
            acc1 = accuracy(output, target_var, topk=(1, ))[0]
            top1.update(acc1[0], target_var.size(0))

            # Get the predictions (idiom fix: extend instead of collecting
            # the None results of list.append in a throwaway comprehension)
            preds.extend(np.argmax(item)
                         for item in output.data.cpu().numpy())
            targets.extend(target_var.cpu().numpy())

            # Add loss and accuracy to Tensorboard
            if multi_run is None:
                writer.add_scalar(logging_label + '/mb_loss', loss.item(),
                                  epoch * len(data_loader) + batch_idx)
                writer.add_scalar(logging_label + '/mb_accuracy',
                                  acc1.cpu().numpy(),
                                  epoch * len(data_loader) + batch_idx)
            else:
                writer.add_scalar(
                    logging_label + '/mb_loss_{}'.format(multi_run),
                    loss.item(), epoch * len(data_loader) + batch_idx)
                writer.add_scalar(
                    logging_label + '/mb_accuracy_{}'.format(multi_run),
                    acc1.cpu().numpy(),
                    epoch * len(data_loader) + batch_idx)

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % log_interval == 0:
                pbar.set_description(logging_label +
                                     ' epoch [{0}][{1}/{2}]\t'.format(
                                         epoch, batch_idx, len(data_loader)))
                pbar.set_postfix(
                    Time='{batch_time.avg:.3f}\t'.format(
                        batch_time=batch_time),
                    Loss='{loss.avg:.4f}\t'.format(loss=losses),
                    Acc1='{top1.avg:.3f}\t'.format(top1=top1),
                    Data='{data_time.avg:.3f}\t'.format(data_time=data_time))
def train(model, train_loader, criterion, scheduler, optimizer, epoch,
          summary_writer):
    """Train for one epoch, logging losses and images to TensorBoard.

    Parameters
    ----------
    model : torch.nn.Module
    train_loader : DataLoader yielding (img, train_mask, tr_mask, tcl_mask,
        radius_map, sin_map, cos_map, meta).
    criterion : loss returning (tr, tcl, sin, cos, radii) terms; also
        receives the global step counter.
    scheduler : LR scheduler (only passed to save_model here; its per-batch
        step is commented out below).
    optimizer : torch.optim.Optimizer
    epoch : int
        Current epoch (drives the checkpoint cadence).
    summary_writer : TensorBoard-style writer used for images and scalars.
    """
    start = time.time()
    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    model.train()
    # Global step counter shared across epochs; drives the summary cadence.
    global total_iter

    for i, (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map,
            meta) in enumerate(train_loader):
        data_time.update(time.time() - end)

        img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map = to_device(
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)

        output = model(img)
        # NOTE(review): criterion also takes total_iter here, unlike the
        # sibling train() above — presumably for iteration-dependent loss
        # weighting; confirm against the criterion implementation.
        tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss = \
            criterion(output, tr_mask, tcl_mask, sin_map, cos_map, radius_map,
                      train_mask, total_iter)
        loss = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss

        # backward
        # scheduler.step()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.update(loss.item())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if cfg.viz and i < cfg.vis_num:
            visualize_network_output(output, tr_mask, tcl_mask,
                                     prefix='train_{}'.format(i))

        if i % cfg.display_freq == 0:
            print(
                'Epoch: [ {} ][ {:03d} / {:03d} ] - Loss: {:.4f} - tr_loss: {:.4f} - tcl_loss: {:.4f} - sin_loss: {:.4f} - cos_loss: {:.4f} - radii_loss: {:.4f} - {:.2f}s/step'
                .format(epoch, i, len(train_loader), loss.item(),
                        tr_loss.item(), tcl_loss.item(), sin_loss.item(),
                        cos_loss.item(), radii_loss.item(), batch_time.avg))

        # write summary
        if total_iter % cfg.summary_freq == 0:
            print('Summary in {}'.format(
                os.path.join(cfg.summary_dir, cfg.exp_name)))
            # Channels 0:2 are text-region logits and 2:4 are center-line
            # logits; keep the positive-class probability of each.
            tr_pred = output[:, 0:2].softmax(dim=1)[:, 1:2]
            tcl_pred = output[:, 2:4].softmax(dim=1)[:, 1:2]
            summary_writer.add_image('input_image',
                                     vutils.make_grid(img, normalize=True),
                                     total_iter)
            summary_writer.add_image(
                'tr/tr_pred', vutils.make_grid(tr_pred * 255, normalize=True),
                total_iter)
            summary_writer.add_image(
                'tr/tr_mask',
                vutils.make_grid(
                    torch.unsqueeze(tr_mask * train_mask, 1) * 255),
                total_iter)
            summary_writer.add_image(
                'tcl/tcl_pred',
                vutils.make_grid(tcl_pred * 255, normalize=True), total_iter)
            summary_writer.add_image(
                'tcl/tcl_mask',
                vutils.make_grid(
                    torch.unsqueeze(tcl_mask * train_mask, 1) * 255),
                total_iter)
            summary_writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'],
                                      total_iter)
            summary_writer.add_scalar('model/tr_loss', tr_loss.item(),
                                      total_iter)
            summary_writer.add_scalar('model/tcl_loss', tcl_loss.item(),
                                      total_iter)
            summary_writer.add_scalar('model/sin_loss', sin_loss.item(),
                                      total_iter)
            summary_writer.add_scalar('model/cos_loss', cos_loss.item(),
                                      total_iter)
            summary_writer.add_scalar('model/radii_loss', radii_loss.item(),
                                      total_iter)
            summary_writer.add_scalar('model/loss', loss.item(), total_iter)

        total_iter += 1

    print('Speed: {}s /step, {}s /epoch'.format(batch_time.avg,
                                                time.time() - start))
    if epoch % cfg.save_freq == 0:
        save_model(model, optimizer, scheduler, epoch)

    print('Training Loss: {}'.format(losses.avg))
def _evaluate(data_loader, model, criterion, writer, epoch, logging_label,
              no_cuda=False, log_interval=10, **kwargs):
    """
    The evaluation routine

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set
    model : torch.nn.module
        The network model being used
    criterion: torch.nn.loss
        The loss function used to compute the loss of the model
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes)
    logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. Its
        prepended to the logging output path and messages.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.

    Returns
    -------
    jss_epoch : float
        Jaccard similarity score of the model on the evaluated split.
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such )
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()

    # Empty lists to store the predictions and target values
    preds = []
    targets = []

    pbar = tqdm(enumerate(data_loader), total=len(data_loader), unit='batch',
                ncols=150, leave=False)
    # Fix: `volatile=True` Variables were removed in PyTorch 0.4 (runtime
    # error); gradient-free inference is expressed with torch.no_grad().
    with torch.no_grad():
        for batch_idx, (input, target) in pbar:
            # Measure data loading time
            data_time.update(time.time() - end)

            # Moving data to GPU.
            # Fix: `async` is a reserved keyword since Python 3.7
            # (SyntaxError); renamed to `non_blocking` in PyTorch 0.4.
            if not no_cuda:
                input = input.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)

            input_var = input
            target_var = target

            # Compute output
            output = model(input_var)

            # Compute and record the loss.
            # Fix: `loss.data[0]` fails on 0-dim tensors in PyTorch >= 0.4;
            # use .item() as the rest of this file does.
            loss = criterion(output, target_var)
            losses.update(loss.item(), input.size(0))

            # Apply sigmoid and take everything above a threshold of 0.5
            squashed_output = torch.nn.Sigmoid()(output).data.cpu().numpy()

            # Store results of each minibatch (idiom fix: extend instead of
            # a throwaway comprehension of append() results)
            preds.extend(get_preds_from_minibatch(squashed_output))
            targets.extend(target.cpu().numpy())

            # Add loss to Tensorboard
            if multi_run is None:
                writer.add_scalar(logging_label + '/mb_loss', loss.item(),
                                  epoch * len(data_loader) + batch_idx)
            else:
                writer.add_scalar(
                    logging_label + '/mb_loss_{}'.format(multi_run),
                    loss.item(), epoch * len(data_loader) + batch_idx)

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % log_interval == 0:
                pbar.set_description(logging_label +
                                     ' epoch [{0}][{1}/{2}]\t'.format(
                                         epoch, batch_idx, len(data_loader)))
                pbar.set_postfix(
                    Time='{batch_time.avg:.3f}\t'.format(
                        batch_time=batch_time),
                    Loss='{loss.avg:.4f}\t'.format(loss=losses),
                    Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Generate a classification report for each epoch.
    # Fix: np.int was removed in NumPy 1.24; the builtin int is equivalent.
    targets = np.array(targets).astype(int)
    preds = np.array(preds).astype(int)
    _log_classification_report(data_loader, epoch, preds, targets, writer)
    jss_epoch = compute_jss(targets, preds)

    # Logging the epoch-wise JSS
    if multi_run is None:
        writer.add_scalar(logging_label + '/loss', losses.avg, epoch)
        writer.add_scalar(logging_label + '/jaccard_similarity', jss_epoch,
                          epoch)
    else:
        writer.add_scalar(logging_label + '/loss_{}'.format(multi_run),
                          losses.avg, epoch)
        writer.add_scalar(
            logging_label + '/jaccard_similarity_{}'.format(multi_run),
            jss_epoch, epoch)

    logging.info(
        _prettyprint_logging_label(logging_label) + ' epoch[{}]: '
        'JSS={jss_epoch:.3f}\t'
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time,
               loss=losses, jss_epoch=jss_epoch))

    return jss_epoch
def train(train_loader, model, criterion, optimizer, observer,
          observer_criterion, observer_optimizer, writer, epoch,
          no_cuda=False, log_interval=25, **kwargs):
    """
    Training routine: the main model is trained on randomly shuffled labels
    while an "observer" network is trained on the main model's features to
    predict the true labels.

    Parameters
    ----------
    :param train_loader : torch.utils.data.DataLoader
        The dataloader of the train set.
    :param model : torch.nn.module
        The network model being used.
    :param criterion : torch.nn.loss
        The loss function used to compute the loss of the model.
    :param optimizer : torch.optim
        The optimizer used to perform the weight update.
    :param observer : torch.nn.module
        Observer network trained on the main model's features.
    :param observer_criterion : torch.nn.loss
        Loss function for the observer network.
    :param observer_optimizer : torch.optim
        Optimizer for the observer network.
    :param writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    :param epoch : int
        Number of the epoch (for logging purposes).
    :param no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    :param log_interval : int
        Interval limiting the logging of mini-batches. Default value of 25.
    :return: average accuracy of the main model over the epoch (float)
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    observer_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    observer_acc_meter = AverageMeter()
    data_time = AverageMeter()

    # Switch to train mode (turn on dropout & stuff)
    model.train()

    # Create a random object
    random_seed = 42
    random1 = np.random.RandomState(random_seed)
    num_classes = observer.module.output_channels

    # Iterate over whole training set
    end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader),
                unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Measure data loading time
        data_time.update(time.time() - end)

        # Generate the shuffled labels
        random_target = torch.LongTensor(
            random1.randint(0, num_classes, len(input)))

        # Moving data to GPU.
        # Fix: `async` is a reserved keyword since Python 3.7 (SyntaxError);
        # PyTorch renamed the argument to `non_blocking` in 0.4.
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            random_target = random_target.cuda(non_blocking=True)

        # Convert the input and its labels to Torch Variables
        input_var = torch.autograd.Variable(input)
        random_target_var = torch.autograd.Variable(random_target)

        # Main model learns the random labels.
        acc, loss = train_one_mini_batch(model, criterion, optimizer,
                                         input_var, random_target_var,
                                         loss_meter, acc_meter)

        # Observer learns the true labels from the main model's features
        # (.data detaches them from the main model's graph).
        input_features_var = torch.autograd.Variable(
            model.module.features.data)
        target_var = torch.autograd.Variable(target)
        observer_acc, observer_loss = train_one_mini_batch(
            observer, observer_criterion, observer_optimizer,
            input_features_var, target_var, observer_loss_meter,
            observer_acc_meter)

        # Add loss and accuracy to Tensorboard.
        # Fix: `loss.data[0]` fails on 0-dim tensors in PyTorch >= 0.4;
        # use .item() instead.
        if multi_run is None:
            writer.add_scalar('train/mb_loss', loss.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_accuracy', acc.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/obs_mb_loss', observer_loss.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/obs_mb_accuracy',
                              observer_acc.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)
        else:
            writer.add_scalar('train/mb_loss_{}'.format(multi_run),
                              loss.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_accuracy_{}'.format(multi_run),
                              acc.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/obs_mb_loss_{}'.format(multi_run),
                              observer_loss.item(),
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/obs_mb_accuracy_{}'.format(multi_run),
                              observer_acc.cpu().numpy(),
                              epoch * len(train_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Log to console
        if batch_idx % log_interval == 0:
            pbar.set_description('train epoch [{0}][{1}/{2}]\t'.format(
                epoch, batch_idx, len(train_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=loss_meter),
                Acc1='{acc_meter.avg:.3f}\t'.format(acc_meter=acc_meter),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Logging the epoch-wise accuracy
    if multi_run is None:
        writer.add_scalar('train/accuracy', acc_meter.avg, epoch)
        writer.add_scalar('train/obs_accuracy', observer_acc_meter.avg, epoch)
    else:
        writer.add_scalar('train/accuracy_{}'.format(multi_run),
                          acc_meter.avg, epoch)
        writer.add_scalar('train/obs_accuracy_{}'.format(multi_run),
                          observer_acc_meter.avg, epoch)

    logging.debug('Train epoch[{}]: '
                  'Acc@1={acc_meter.avg:.3f}\t'
                  'Loss={loss.avg:.4f}\t'
                  'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'
                  .format(epoch, batch_time=batch_time, data_time=data_time,
                          loss=loss_meter, acc_meter=acc_meter))

    return acc_meter.avg
def validate(val_loader, model, criterion):
    """Validate a PSENet-style text detector.

    Parameters
    ----------
    val_loader : DataLoader yielding (imgs, gt_texts, gt_kernels,
        training_masks).
    model : torch.nn.Module whose output channel 0 is the text map and the
        remaining channels are shrunk kernel maps.
    criterion : callable(pred, gt, mask) -> loss.

    Returns
    -------
    tuple of floats
        (avg loss, text mean acc, kernel mean acc, text mean IoU,
         kernel mean IoU)
    """
    with torch.no_grad():
        model.eval()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        running_metric_text = runningScore(2)
        running_metric_kernel = runningScore(2)
        end = time.time()
        for batch_idx, (imgs, gt_texts, gt_kernels,
                        training_masks) in enumerate(val_loader):
            data_time.update(time.time() - end)

            imgs = Variable(imgs.cuda())
            gt_texts = Variable(gt_texts.cuda())
            gt_kernels = Variable(gt_kernels.cuda())
            training_masks = Variable(training_masks.cuda())

            outputs = model(imgs)
            texts = outputs[:, 0, :, :]     # text-region logits
            kernels = outputs[:, 1:, :, :]  # shrunk-kernel logits

            selected_masks = ohem_batch(texts, gt_texts, training_masks)
            selected_masks = Variable(selected_masks.cuda())
            loss_text = criterion(texts, gt_texts, selected_masks)

            loss_kernels = []
            # Kernel losses only count pixels predicted as text and not
            # excluded by the training mask.
            mask0 = torch.sigmoid(texts).data.cpu().numpy()
            mask1 = training_masks.data.cpu().numpy()
            selected_masks = ((mask0 > 0.5) & (mask1 > 0.5)).astype('float32')
            selected_masks = torch.from_numpy(selected_masks).float()
            selected_masks = Variable(selected_masks.cuda())
            # Generalized: iterate over however many kernel maps the model
            # emits instead of the previous hard-coded 6.
            for i in range(kernels.size(1)):
                kernel_i = kernels[:, i, :, :]
                gt_kernel_i = gt_kernels[:, i, :, :]
                loss_kernel_i = criterion(kernel_i, gt_kernel_i,
                                          selected_masks)
                loss_kernels.append(loss_kernel_i)
            loss_kernel = sum(loss_kernels) / len(loss_kernels)

            loss = 0.7 * loss_text + 0.3 * loss_kernel
            losses.update(loss.item(), imgs.size(0))

            score_text = cal_text_score(texts, gt_texts, training_masks,
                                        running_metric_text)
            score_kernel = cal_kernel_score(kernels, gt_kernels, gt_texts,
                                            training_masks,
                                            running_metric_kernel)

            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % 5 == 0:
                output_log = '({batch}/{size}) Batch: {bt:.3f}s | TOTAL: {total:.0f}min | ETA: {eta:.0f}min '.format(
                    batch=batch_idx + 1,
                    size=len(val_loader),
                    bt=batch_time.avg,
                    total=batch_time.avg * batch_idx / 60.0,
                    eta=batch_time.avg * (len(val_loader) - batch_idx) / 60.0)
                print(output_log)
                sys.stdout.flush()

    return (float(losses.avg), float(score_text['Mean Acc']),
            float(score_kernel['Mean Acc']), float(score_text['Mean IoU']),
            float(score_kernel['Mean IoU']))
def train(train_loader, model, criterion, optimizer, writer, epoch, no_cuda=False, log_interval=25, **kwargs):
    """Training routine (one epoch), logging mini-batch loss and epoch JSS.

    Parameters
    ----------
    train_loader : torch.utils.data.DataLoader
        The dataloader of the train set.
    model : torch.nn.Module
        The network model being used.
    criterion : torch.nn loss
        The loss function used to compute the loss of the model.
    optimizer : torch.optim optimizer
        The optimizer used to perform the weight update.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches.

    Returns
    -------
    jss_epoch : float
        Epoch-wise Jaccard similarity score over all mini-batch predictions.
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    jss_meter = AverageMeter()
    data_time = AverageMeter()

    # Switch to train mode (turn on dropout & stuff)
    model.train()

    # Empty lists to store the predictions and target values
    preds = []
    targets = []

    # Iterate over whole training set
    end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU. FIX: `.cuda(async=True)` is a SyntaxError on
        # Python >= 3.7 (`async` became a keyword); `non_blocking=True` is
        # the supported spelling. `Variable` wrappers dropped (no-op API).
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        jss, loss, target_vals, pred_vals = train_one_mini_batch(
            model, criterion, optimizer, input, target, loss_meter, jss_meter)

        # Store results of each minibatch
        preds.extend(pred_vals)
        targets.extend(target_vals)

        # Add loss to Tensorboard. FIX: `loss.data[0]` was removed in
        # PyTorch 0.5; `loss.item()` is the replacement.
        if multi_run is None:
            writer.add_scalar('train/mb_loss', loss.item(),
                              epoch * len(train_loader) + batch_idx)
        else:
            writer.add_scalar('train/mb_loss_{}'.format(multi_run), loss.item(),
                              epoch * len(train_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Log to console
        if batch_idx % log_interval == 0:
            pbar.set_description('train epoch [{0}][{1}/{2}]\t'.format(
                epoch, batch_idx, len(train_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=loss_meter),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Generate the epoch-wise JSS. FIX: `np.int` was removed in NumPy 1.24;
    # the builtin `int` is the exact equivalent.
    targets = np.array(targets).astype(int)
    preds = np.array(preds).astype(int)
    jss_epoch = compute_jss(targets, preds)

    # Logging the epoch-wise loss and JSS
    if multi_run is None:
        writer.add_scalar('train/loss', loss_meter.avg, epoch)
        writer.add_scalar('train/jaccard_similarity', jss_epoch, epoch)
    else:
        writer.add_scalar('train/loss_{}'.format(multi_run), loss_meter.avg, epoch)
        writer.add_scalar('train/jaccard_similarity_{}'.format(multi_run), jss_epoch, epoch)

    logging.debug(
        'Train epoch[{}]: '
        'JSS={jss_epoch:.3f}\t'
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time,
               loss=loss_meter, jss_epoch=jss_epoch))

    return jss_epoch
def _evaluate(data_loader, model, criterion, writer, epoch, logging_label, no_cuda=False, log_interval=10, **kwargs):
    """Autoencoder evaluation routine: reconstruct inputs and score them.

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set (labels are ignored).
    model : torch.nn.Module
        The network model being used.
    criterion : torch.nn loss
        Reconstruction loss; the target is the input itself.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. Its
        prepended to the logging output path and messages.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches.

    Returns
    -------
    losses.avg : float
        Average reconstruction loss over the evaluated split.
        (FIX: the original docstring advertised top1.avg, but no accuracy
        is computed in this routine.)
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such)
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()
    pbar = tqdm(enumerate(data_loader), total=len(data_loader), unit='batch', ncols=150, leave=False)
    # FIX: `Variable(..., volatile=True)` is dead API; torch.no_grad() is the
    # modern equivalent for inference.
    with torch.no_grad():
        for batch_idx, (input, _) in pbar:
            # Measure data loading time
            data_time.update(time.time() - end)

            # Moving data to GPU. FIX: `async=True` is a SyntaxError on
            # Python >= 3.7; use `non_blocking=True`.
            if not no_cuda:
                input = input.cuda(non_blocking=True)

            # Compute output
            output = model(input)

            # Compute and record the loss — the target IS the input.
            # FIX: loss.data[0] -> loss.item().
            loss = criterion(output, input)
            losses.update(loss.item(), input.size(0))

            # Add loss to Tensorboard
            if multi_run is None:
                writer.add_scalar(logging_label + '/mb_loss', loss.item(),
                                  epoch * len(data_loader) + batch_idx)
            else:
                writer.add_scalar(logging_label + '/mb_loss_{}'.format(multi_run),
                                  loss.item(), epoch * len(data_loader) + batch_idx)

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % log_interval == 0:
                pbar.set_description(logging_label + ' epoch [{0}][{1}/{2}]\t'.format(
                    epoch, batch_idx, len(data_loader)))
                pbar.set_postfix(
                    Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                    Loss='{loss.avg:.4f}\t'.format(loss=losses),
                    Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Log a 5x5 grid of the LAST batch's inputs and reconstructions.
    # NOTE(review): placed after the loop (last batch only), matching the
    # single `global_step=epoch` write — confirm against upstream intent.
    input_img = torchvision.utils.make_grid(input[:25].data.cpu(), nrow=5,
                                            normalize=False, scale_each=False).permute(1, 2, 0).numpy()
    output_img = torchvision.utils.make_grid(output[:25].data.cpu(), nrow=5,
                                             normalize=False, scale_each=False).permute(1, 2, 0).numpy()
    save_image_and_log_to_tensorboard(writer, tag=logging_label + '/input_image',
                                      image=input_img)
    save_image_and_log_to_tensorboard(writer, tag=logging_label + '/output_image',
                                      image=output_img, global_step=epoch)

    return losses.avg
def train(train_loader, model, criterion, optimizer, writer, epoch, no_cuda, log_interval=25, **kwargs):
    """Training routine (one epoch) for a triplet network.

    Parameters
    ----------
    train_loader : torch.utils.data.DataLoader
        Yields (anchor, positive, negative) batches; each element may be
        4-D (bs, c, h, w) or 5-D (bs, ncrops, c, h, w) for multi-crop.
    model : torch.nn.Module
        The embedding network being trained.
    criterion : torch.nn loss
        Triplet loss, called as criterion(positive, anchor, negative).
    optimizer : torch.optim optimizer
        The optimizer used to perform the weight update.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches.

    Returns
    -------
    int
        Placeholder 0. In the future this should become the FPR95.
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # Switch to train mode (turn on dropout & stuff)
    model.train()

    # Iterate over whole training set
    end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), unit='batch', ncols=150, leave=False)
    for batch_idx, (data_a, data_p, data_n) in pbar:
        # FIX: remember the multi-crop case in a flag BEFORE flattening. The
        # original re-tested `len(data_a.size()) == 5` AFTER the
        # view(-1, c, h, w) reshape, which is always False, so the per-crop
        # outputs were never averaged back to one embedding per sample.
        multicrop = len(data_a.size()) == 5
        if multicrop:
            bs, ncrops, c, h, w = data_a.size()
            data_a = data_a.view(-1, c, h, w)
            data_p = data_p.view(-1, c, h, w)
            data_n = data_n.view(-1, c, h, w)

        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU
        if not no_cuda:
            data_a, data_p, data_n = data_a.cuda(non_blocking=True), data_p.cuda(
                non_blocking=True), data_n.cuda(non_blocking=True)

        # Compute output
        out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)
        if multicrop:
            # Average the ncrops embeddings of each original sample.
            out_a = out_a.view(bs, ncrops, -1).mean(1)
            out_p = out_p.view(bs, ncrops, -1).mean(1)
            out_n = out_n.view(bs, ncrops, -1).mean(1)

        # Compute and record the loss
        loss = criterion(out_p, out_a, out_n)
        losses.update(loss.item(), data_a.size(0))

        # Reset gradient, backprop, and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log to console
        if batch_idx % log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data_a), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), losses.avg))

        # Add mb loss to Tensorboard
        if multi_run is None:
            writer.add_scalar('train/mb_loss', loss.item(),
                              epoch * len(train_loader) + batch_idx)
        else:
            writer.add_scalar('train/mb_loss_{}'.format(multi_run), loss.item(),
                              epoch * len(train_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    return 0
def _evaluate(data_loader, model, criterion, writer, epoch, logging_label, no_cuda=False, log_interval=10, **kwargs):
    """Evaluation routine for image-to-image (satellite -> map) translation.

    Each loaded sample is a single image holding input and ground truth side
    by side; it is split in half along the width axis.

    TODO: the accuracy computation is disabled. Re-enabling it requires a 2D
    softmax over class labels (or soft labels) instead of regressing the
    output image directly.

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set.
    model : torch.nn.Module
        The network model being used.
    criterion : torch.nn loss
        The loss function used to compute the loss of the model.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. Its
        prepended to the logging output path and messages.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches.

    Returns
    -------
    None
        (FIX: the original docstring advertised top1.avg, but this routine
        ends without a return statement.)
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such)
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()
    pbar = tqdm(enumerate(data_loader), total=len(data_loader), unit='batch', ncols=150, leave=False)
    with torch.no_grad():
        for batch_idx, (input, _) in pbar:
            # Measure data loading time
            data_time.update(time.time() - end)

            # Moving data to GPU. FIX: `async=True` is a SyntaxError on
            # Python >= 3.7; use `non_blocking=True`.
            if not no_cuda:
                input = input.cuda(non_blocking=True)

            # Split the data into halves to separate the input from the GT
            satel_image, map_image = torch.chunk(input, chunks=2, dim=3)

            # Compute output
            output = model(satel_image)

            # Compute and record the loss. FIX: loss.data[0] -> loss.item().
            loss = criterion(output, map_image)
            losses.update(loss.item(), input.size(0))

            # Add loss to Tensorboard
            if multi_run is None:
                writer.add_scalar(logging_label + '/mb_loss', loss.item(),
                                  epoch * len(data_loader) + batch_idx)
            else:
                writer.add_scalar(logging_label + '/mb_loss_{}'.format(multi_run),
                                  loss.item(), epoch * len(data_loader) + batch_idx)

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % log_interval == 0:
                pbar.set_description(logging_label + ' epoch [{0}][{1}/{2}]\t'.format(
                    epoch, batch_idx, len(data_loader)))
                pbar.set_postfix(
                    Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                    Loss='{loss.avg:.4f}\t'.format(loss=losses),
                    Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Log the LAST batch's input / output / target images once per epoch.
    if multi_run is None:
        save_image_and_log_to_tensorboard(writer, tag=logging_label + '/output',
                                          image=output[:1], global_step=epoch)
        save_image_and_log_to_tensorboard(writer, tag=logging_label + '/input',
                                          image=satel_image[:1])
        save_image_and_log_to_tensorboard(writer, tag=logging_label + '/target',
                                          image=map_image[:1])
    else:
        save_image_and_log_to_tensorboard(writer, tag=logging_label + '/output_{}'.format(multi_run),
                                          image=output[:1], global_step=epoch)
        save_image_and_log_to_tensorboard(writer, tag=logging_label + '/input_{}'.format(multi_run),
                                          image=satel_image[:1])
        # FIX: target tag lacked the multi-run suffix, making it collide with
        # the single-run tag; now consistent with /output_{} and /input_{}.
        save_image_and_log_to_tensorboard(writer, tag=logging_label + '/target_{}'.format(multi_run),
                                          image=map_image[:1])

    logging.info(
        _prettyprint_logging_label(logging_label) + ' epoch[{}]: '
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time, loss=losses))
def validate(val_loader, model, criterion, writer, epoch, class_encodings, no_cuda=False, log_interval=10, **kwargs):
    """ The evaluation routine

    Parameters
    ----------
    val_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set
    model : torch.nn.module
        The network model being used
    criterion: torch.nn.loss
        The loss function used to compute the loss of the model
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes)
    class_encodings : List
        Contains the classes (range of ints)
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True' means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.

    Returns
    -------
    meanIU.avg : float
        MeanIU of the model of the evaluated split
    """
    # 'Run' is injected in kwargs at runtime IFF it is a multi-run event
    multi_run = kwargs['run'] if 'run' in kwargs else None
    num_classes = len(class_encodings)

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    meanIU = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such )
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()
    pbar = tqdm(enumerate(val_loader), total=len(val_loader), unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        # Compute output
        # NOTE(review): there is no torch.no_grad() guard here, so autograd
        # state is still tracked during validation — wasteful but harmless;
        # confirm before changing.
        output = model(input)

        # Compute and record the loss
        loss = criterion(output, target)
        losses.update(loss.item(), input.size(0))

        # Compute and record the accuracy (per-batch mean IoU over the
        # num_classes segmentation classes, weighted by batch size)
        _, _, mean_iu_batch, _ = accuracy_segmentation(target.cpu().numpy(), get_argmax(output), num_classes)
        meanIU.update(mean_iu_batch, input.size(0))

        # Add loss and meanIU to Tensorboard; the tag gets a run suffix only
        # for multi-run experiments so curves stay separable
        scalar_label = 'val/mb_loss' if multi_run is None else 'val/mb_loss_{}'.format(multi_run)
        writer.add_scalar(scalar_label, loss.item(), epoch * len(val_loader) + batch_idx)
        scalar_label = 'val/mb_meanIU' if multi_run is None else 'val/mb_meanIU_{}'.format(multi_run)
        writer.add_scalar(scalar_label, mean_iu_batch, epoch * len(val_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            pbar.set_description('val epoch [{0}][{1}/{2}]\t'.format(epoch, batch_idx, len(val_loader)))
            pbar.set_postfix(Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                             Loss='{loss.avg:.4f}\t'.format(loss=losses),
                             meanIU='{meanIU.avg:.3f}\t'.format(meanIU=meanIU),
                             Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Logging the epoch-wise meanIU
    scalar_label = 'val/meanIU' if multi_run is None else 'val/meanIU_{}'.format(multi_run)
    writer.add_scalar(scalar_label, meanIU.avg, epoch)
    logging.info(_prettyprint_logging_label("val") +
                 ' epoch[{}]: '
                 'MeanIU={meanIU.avg:.3f}\t'
                 'Loss={loss.avg:.4f}\t'
                 'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'
                 .format(epoch, batch_time=batch_time, data_time=data_time, loss=losses, meanIU=meanIU))

    return meanIU.avg
def test(test_loader, model, criterion, writer, epoch, class_encodings, img_names_sizes_dict, dataset_folder, post_process, no_cuda=False, log_interval=10, **kwargs):
    """ The evaluation routine for the test split with full-image stitching.

    The loader yields sliding-window patches; patches are merged back onto a
    per-image canvas and each image is scored once fully covered.

    Parameters
    ----------
    test_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set
    model : torch.nn.module
        The network model being used
    criterion: torch.nn.loss
        The loss function used to compute the loss of the model
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes)
    class_encodings : List
        Contains the range of encoded classes
    img_names_sizes_dict: dictionary {str: (int, int)}
        Key: gt image name (with extension), Value: image size
    dataset_folder : str
        Location of the dataset on the file system (passed through to
        process_full_image)
    post_process : Boolean
        apply post-processing to the output of the network
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches.

    Returns
    -------
    meanIU.avg : float
        MeanIU of the model of the evaluated split (averaged over full
        images, not patches)
    """
    # 'Run' is injected in kwargs at runtime IFF it is a multi-run event
    multi_run = kwargs['run'] if 'run' in kwargs else None
    num_classes = len(class_encodings)

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    meanIU = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such )
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()

    # Need to store the images currently being processed
    canvas = {}

    pbar = tqdm(enumerate(test_loader), total=len(test_loader), unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Unpack input
        input, top_left_coordinates, test_img_names = input

        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        # Compute output
        output = model(input)

        # Compute and record the loss
        loss = criterion(output, target)
        losses.update(loss.item(), input.size(0))

        # Compute and record the batch meanIU (patch-level, logging only)
        _, _, mean_iu_batch, _ = accuracy_segmentation(target.cpu().numpy(), get_argmax(output), num_classes)

        # Add loss and meanIU to Tensorboard
        scalar_label = 'test/mb_loss' if multi_run is None else 'test/mb_loss_{}'.format(multi_run)
        writer.add_scalar(scalar_label, loss.item(), epoch * len(test_loader) + batch_idx)
        scalar_label = 'test/mb_meanIU' if multi_run is None else 'test/mb_meanIU_{}'.format(multi_run)
        writer.add_scalar(scalar_label, mean_iu_batch, epoch * len(test_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            pbar.set_description('test epoch [{0}][{1}/{2}]\t'.format(epoch, batch_idx, len(test_loader)))
            pbar.set_postfix(Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                             Loss='{loss.avg:.4f}\t'.format(loss=losses),
                             meanIU='{meanIU.avg:.3f}\t'.format(meanIU=meanIU),
                             Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

        # Output needs to be patched together to form the complete output of the full image
        # patches are returned as a sliding window over the full image, overlapping sections are averaged
        for patch, x, y, img_name in zip(output.data.cpu().numpy(),
                                         top_left_coordinates[0].numpy(),
                                         top_left_coordinates[1].numpy(),
                                         test_img_names):
            # Is a new image?
            if img_name not in canvas:
                # Create a new image of the right size filled with NaNs
                canvas[img_name] = np.empty((num_classes, *img_names_sizes_dict[img_name]))
                canvas[img_name].fill(np.nan)

            # Add the patch to the image
            canvas[img_name] = merge_patches(patch, (x, y), canvas[img_name])

            # Save the image when done (no NaN left means full coverage)
            if not np.isnan(np.sum(canvas[img_name])):
                # Save the final image
                mean_iu = process_full_image(img_name, canvas[img_name], multi_run,
                                             dataset_folder, class_encodings, post_process)
                # Update the meanIU
                meanIU.update(mean_iu, 1)
                # Remove the entry
                canvas.pop(img_name)
                logging.info("\nProcessed image {} with mean IU={}".format(img_name, mean_iu))

    # Canvas MUST be empty or something was wrong with coverage of all images
    assert len(canvas) == 0

    # Logging the epoch-wise meanIU.
    # FIX: this was written under the mini-batch tag 'test/mb_meanIU',
    # colliding with the per-batch curve above; 'test/meanIU' matches the
    # convention of validate()'s 'val/meanIU'.
    scalar_label = 'test/meanIU' if multi_run is None else 'test/meanIU_{}'.format(multi_run)
    writer.add_scalar(scalar_label, meanIU.avg, epoch)
    logging.info(_prettyprint_logging_label("test") +
                 ' epoch[{}]: '
                 'MeanIU={meanIU.avg:.3f}\t'
                 'Loss={loss.avg:.4f}\t'
                 'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'
                 .format(epoch, batch_time=batch_time, data_time=data_time, loss=losses, meanIU=meanIU))

    return meanIU.avg
def validation(model, valid_loader, criterion, epoch, logger):
    """Run one full validation epoch and log the averaged losses.

    Evaluates `model` on every batch of `valid_loader` under no_grad,
    accumulates the five TextSnake loss components plus their sum in
    AverageMeters, optionally visualizes predictions, and writes the
    epoch averages to `logger` under the 'val' tag.
    """
    meter_names = ('loss', 'tr_loss', 'tcl_loss', 'sin_loss', 'cos_loss', 'radii_loss')
    meters = {name: AverageMeter() for name in meter_names}

    with torch.no_grad():
        model.eval()
        for step, batch in enumerate(valid_loader):
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map, meta = batch
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map = to_device(
                img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)

            # Forward pass: `output` is the predicted 7-channel score map;
            # the masks and maps are the supervision targets.
            output = model(img)
            tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss = criterion(
                output, tr_mask, tcl_mask, sin_map, cos_map, radius_map, train_mask)
            total = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss

            # Fold the sum and each component into its running average.
            components = (total, tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss)
            for name, value in zip(meter_names, components):
                meters[name].update(value.item())

            if cfg.viz and step % cfg.viz_freq == 0:
                visualize_network_output(output, tr_mask, tcl_mask, mode='val')

            if step % cfg.display_freq == 0:
                print(
                    'Validation: - Loss: {:.4f} - tr_loss: {:.4f} - tcl_loss: {:.4f} - sin_loss: {:.4f} - cos_loss: {:.4f} - radii_loss: {:.4f}'
                    .format(total.item(), tr_loss.item(), tcl_loss.item(),
                            sin_loss.item(), cos_loss.item(), radii_loss.item()))

        # One scalar write per epoch with the averaged components.
        logger.write_scalars({name: meters[name].avg for name in meter_names},
                             tag='val', n_iter=epoch)
        print('Validation Loss: {}'.format(meters['loss'].avg))
def train(train_loader, model, criterion, optimizer, writer, epoch, class_encodings, no_cuda=False, log_interval=25, **kwargs):
    """ Training routine

    Parameters
    ----------
    train_loader : torch.utils.data.DataLoader
        The dataloader of the train set.
    model : torch.nn.module
        The network model being used.
    criterion : torch.nn.loss
        The loss function used to compute the loss of the model.
    optimizer : torch.optim
        The optimizer used to perform the weight update.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    class_encodings : List
        Contains the classes (range of ints); only its length is used here.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True' means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.

    Returns
    ----------
    meanIU.avg : float
        meanIU of the model of the evaluated split
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None
    num_classes = len(class_encodings)

    # Instantiate the counters
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    meanIU = AverageMeter()
    data_time = AverageMeter()

    # Switch to train mode (turn on dropout & stuff)
    model.train()

    # Iterate over whole training set
    end = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        # Forward/backward/update plus meter bookkeeping happen inside the
        # helper; it returns the batch meanIU and the loss tensor.
        mean_iu, loss = train_one_mini_batch(model, criterion, optimizer, input, target, loss_meter, meanIU, num_classes)

        # Add loss and accuracy to Tensorboard (run-suffixed tags keep
        # multi-run curves separable)
        log_loss = loss.item()
        if multi_run is None:
            writer.add_scalar('train/mb_loss', log_loss, epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_meanIU', mean_iu, epoch * len(train_loader) + batch_idx)
        else:
            writer.add_scalar('train/mb_loss_{}'.format(multi_run), log_loss,
                              epoch * len(train_loader) + batch_idx)
            writer.add_scalar('train/mb_meanIU_{}'.format(multi_run), mean_iu,
                              epoch * len(train_loader) + batch_idx)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Log to console
        if batch_idx % log_interval == 0:
            pbar.set_description('train epoch [{0}][{1}/{2}]\t'.format(
                epoch, batch_idx, len(train_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=loss_meter),
                meanIU='{meanIU.avg:.3f}\t'.format(meanIU=meanIU),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Logging the epoch-wise accuracy
    if multi_run is None:
        writer.add_scalar('train/meanIU', meanIU.avg, epoch)
    else:
        writer.add_scalar('train/meanIU_{}'.format(multi_run), meanIU.avg, epoch)

    logging.debug(
        'Train epoch[{}]: '
        'MeanIU={meanIU.avg:.3f}\t'
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time, loss=loss_meter, meanIU=meanIU))

    return meanIU.avg
def train(model, train_loader, criterion, scheduler, optimizer, epoch, logger):
    """Train TextSnake for one epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing the 7-channel geometry score map.
    train_loader : torch.utils.data.DataLoader
        Yields (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map,
        cos_map, meta) batches.
    criterion : callable
        Returns the five TextSnake loss components.
    scheduler : torch.optim.lr_scheduler
        LR scheduler, stepped once per epoch.
    optimizer : torch.optim optimizer
        The optimizer used to perform the weight update.
    epoch : int
        Current epoch number (for logging and checkpointing).
    logger : object
        Provides write_scalars(dict, tag=..., n_iter=...) for scalar logging.

    Side effects: increments the module-level `train_step` counter and saves
    a checkpoint every cfg.save_freq epochs.
    """
    global train_step

    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    model.train()

    scheduler.step()
    # FIX: `lr` was printed without ever being assigned, raising NameError on
    # the very first epoch; fetch it from the scheduler as the sibling
    # training routines do.
    lr = scheduler.get_lr()
    print('Epoch: {} : LR = {}'.format(epoch, lr))

    for i, (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map, meta) in enumerate(train_loader):
        data_time.update(time.time() - end)
        train_step += 1
        img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map = to_device(
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)

        # Forward pass over the geometry maps
        output = model(img)

        # Loss: sum of the five supervised components
        tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss = \
            criterion(output, tr_mask, tcl_mask, sin_map, cos_map, radius_map, train_mask)
        loss = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss

        # Backward: clear stale gradients, backprop, apply the update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.update(loss.item())
        batch_time.update(time.time() - end)
        end = time.time()

        if cfg.viz and i % cfg.viz_freq == 0:
            visualize_network_output(output, tr_mask, tcl_mask, mode='train')

        if i % cfg.display_freq == 0:
            print(
                '({:d} / {:d}) - Loss: {:.4f} - tr_loss: {:.4f} - tcl_loss: {:.4f} - sin_loss: {:.4f} - cos_loss: {:.4f} - radii_loss: {:.4f}'
                .format(i, len(train_loader), loss.item(), tr_loss.item(),
                        tcl_loss.item(), sin_loss.item(), cos_loss.item(),
                        radii_loss.item()))

        if i % cfg.log_freq == 0:
            logger.write_scalars(
                {
                    'loss': loss.item(),
                    'tr_loss': tr_loss.item(),
                    'tcl_loss': tcl_loss.item(),
                    'sin_loss': sin_loss.item(),
                    'cos_loss': cos_loss.item(),
                    'radii_loss': radii_loss.item()
                }, tag='train', n_iter=train_step)

    if epoch % cfg.save_freq == 0:
        save_model(model, epoch, scheduler.get_lr(), optimizer)

    print('Training Loss: {}'.format(losses.avg))
def train(self, model, train_loader, criterion, scheduler, optimizer, epoch, logger, train_step):
    """Train TextSnake for one epoch and return the updated step counter.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing the 7-channel geometry score map.
    train_loader : torch.utils.data.DataLoader
        Yields (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map,
        cos_map, meta) batches.
    criterion : callable
        Returns the five TextSnake loss components.
    scheduler : torch.optim.lr_scheduler
        LR scheduler, stepped once per epoch.
    optimizer : torch.optim optimizer
        The optimizer used to perform the weight update.
    epoch : int
        Current epoch number (for logging and checkpointing).
    logger : object
        Provides write_scalars(dict, tag=..., n_iter=...).
    train_step : int
        Global iteration counter carried across epochs.

    Returns
    -------
    int
        `train_step` incremented by the number of iterations performed.
    """
    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    model.train()

    scheduler.step()
    lr = scheduler.get_lr()[0]
    print('Epoch: {} : LR = {}'.format(epoch, lr))

    for i, (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map, meta) in enumerate(train_loader):
        data_time.update(time.time() - end)
        train_step += 1
        img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map = to_device(
            img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)

        output = model(img)
        tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss = \
            criterion(output, tr_mask, tcl_mask, sin_map, cos_map, radius_map, train_mask)
        loss = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.update(loss.item())

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if cfg.viz and i % cfg.viz_freq == 0:
            visualize_network_output(output, tr_mask, tcl_mask, mode='train')

        if i % cfg.display_freq == 0:
            # Some criterion variants apparently return components whose
            # .item() can fail; fall back to the short report then.
            # FIX: bare `except:` (which also swallows KeyboardInterrupt and
            # SystemExit) narrowed to `except Exception`.
            try:
                print(
                    '({:d} / {:d}) - Loss: {:.4f} - tr_loss: {:.4f} - tcl_loss: {:.4f} - sin_loss: {:.4f} - cos_loss: {:.4f} - radii_loss: {:.4f}'
                    .format(i, len(train_loader), loss.item(), tr_loss.item(),
                            tcl_loss.item(), sin_loss.item(), cos_loss.item(),
                            radii_loss.item()))
            except Exception:
                print('({:d} / {:d}) - Loss: {:.4f} - tr_loss: {:.4f}'.
                      format(i, len(train_loader), loss.item(), tr_loss.item()))

        if i % cfg.log_freq == 0:
            # Same fallback for scalar logging (FIX: bare except narrowed).
            try:
                logger.write_scalars(
                    {
                        'loss': loss.item(),
                        'tr_loss': tr_loss.item(),
                        'tcl_loss': tcl_loss.item(),
                        'sin_loss': sin_loss.item(),
                        'cos_loss': cos_loss.item(),
                        'radii_loss': radii_loss.item()
                    }, tag='train', n_iter=train_step)
            except Exception:
                logger.write_scalars(
                    {
                        'loss': loss.item(),
                        'tr_loss': tr_loss.item()
                    }, tag='train', n_iter=train_step)

    if epoch % cfg.save_freq == 0:
        self.save_model(model, epoch, scheduler.get_lr(), optimizer)

    print('Training Loss: {}'.format(losses.avg))
    return train_step
def _evaluate(data_loader, model, criterion, writer, epoch, logging_label, no_cuda=False, log_interval=10, **kwargs):
    """
    The evaluation routine.

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set.
    model : torch.nn.Module
        The network model being used.
    criterion : torch.nn loss
        The loss function used to compute the loss of the model.
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes).
    logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. It's
        prepended to the logging output path and messages.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.

    Returns
    -------
    top1.avg : float
        Accuracy of the model on the evaluated split.
    """
    # 'run' distinguishes repeated runs of the same experiment in the logs.
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such)
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()

    # Empty lists to store the predictions and target values
    preds = []
    targets = []

    pbar = tqdm(enumerate(data_loader), total=len(data_loader),
                unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # A 5-D input means the dataset delivered several crops per sample
        # (e.g. [bs, ncrops, c, h, w]): fold the crops into the batch
        # dimension and average the per-crop outputs after the forward pass.
        # BUGFIX: this flag previously overwrote `multi_run`, which made the
        # `multi_run is None` logging branches below unreachable; it is now a
        # separate variable.
        multi_crop = False
        if len(input.size()) == 5:
            multi_crop = True
            bs, ncrops, c, h, w = input.size()
            input = input.view(-1, c, h, w)  # fuse batch size and ncrops

        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU.
        # BUGFIX: `async=True` is a SyntaxError on Python 3.7+ ('async' became
        # a keyword); PyTorch's replacement is `non_blocking=True`.
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        # Compute output without building the autograd graph (replaces the
        # deprecated `Variable(..., volatile=True)` idiom).
        with torch.no_grad():
            output = model(input)
            if multi_crop:
                output = output.view(bs, ncrops, -1).mean(1)
            # Compute the loss; `.item()` replaces the removed `loss.data[0]`.
            loss = criterion(output, target)
        losses.update(loss.item(), input.size(0))

        # Compute and record the accuracy
        acc1 = accuracy(output.data, target, topk=(1, ))[0]
        top1.update(acc1[0], input.size(0))

        # Get the predictions
        preds.extend(np.argmax(item) for item in output.data.cpu().numpy())
        targets.extend(target.cpu().numpy())

        # Add loss and accuracy to Tensorboard
        step = epoch * len(data_loader) + batch_idx
        if multi_run is None:
            writer.add_scalar(logging_label + '/mb_loss', loss.item(), step)
            writer.add_scalar(logging_label + '/mb_accuracy',
                              acc1.cpu().numpy(), step)
        else:
            writer.add_scalar(logging_label + '/mb_loss_{}'.format(multi_run),
                              loss.item(), step)
            writer.add_scalar(logging_label + '/mb_accuracy_{}'.format(multi_run),
                              acc1.cpu().numpy(), step)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            pbar.set_description(logging_label +
                                 ' epoch [{0}][{1}/{2}]\t'.format(
                                     epoch, batch_idx, len(data_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=losses),
                Acc1='{top1.avg:.3f}\t'.format(top1=top1),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Make a confusion matrix
    try:
        cm = confusion_matrix(y_true=targets, y_pred=preds)
        confusion_matrix_heatmap = make_heatmap(cm, data_loader.dataset.classes)
    except ValueError:
        logging.warning('Confusion Matrix did not work as expected')
        confusion_matrix_heatmap = np.zeros((10, 10, 3))

    # Logging the epoch-wise accuracy and confusion matrix
    if multi_run is None:
        writer.add_scalar(logging_label + '/accuracy', top1.avg, epoch)
        save_image_and_log_to_tensorboard(writer,
                                          tag=logging_label + '/confusion_matrix',
                                          image=confusion_matrix_heatmap,
                                          global_step=epoch)
    else:
        writer.add_scalar(logging_label + '/accuracy_{}'.format(multi_run),
                          top1.avg, epoch)
        save_image_and_log_to_tensorboard(
            writer,
            tag=logging_label + '/confusion_matrix_{}'.format(multi_run),
            image=confusion_matrix_heatmap,
            global_step=epoch)

    logging.info(
        _prettyprint_logging_label(logging_label) + ' epoch[{}]: '
        'Acc@1={top1.avg:.3f}\t'
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time,
               loss=losses, top1=top1))

    # Generate a classification report for each epoch
    _log_classification_report(data_loader, epoch, preds, targets, writer)

    return top1.avg
def train(model, train_loader, criterion, scheduler, optimizer, epoch, logger):
    """Run one training epoch of the GCN-augmented text-detection model.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(img, gt_roi, to_device)`` and returning
        ``(output, gcn_data)``.
    train_loader : torch.utils.data.DataLoader
        Yields ``(img, train_mask, tr_mask, tcl_mask, radius_map, sin_map,
        cos_map, gt_roi)`` batches.
    criterion : callable
        Returns the six partial losses ``(tr, tcl, sin, cos, radii, gcn)``.
    scheduler : torch.optim.lr_scheduler._LRScheduler
        Used only to report / checkpoint the current LR (stepping is
        intentionally disabled here).
    optimizer : torch.optim.Optimizer
        Optimizer updated per batch.
    epoch : int
        Current epoch index (for logging and checkpointing).
    logger : object
        Must provide ``write_scalars(dict, tag, n_iter)``.

    Side effects: increments the module-level ``train_step`` counter and may
    save a checkpoint every ``cfg.save_freq`` epochs.
    """
    global train_step

    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    model.train()
    # scheduler.step()
    print('Epoch: {} : LR = {}'.format(epoch, scheduler.get_lr()))

    for i, (img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map, gt_roi) in enumerate(train_loader):
        data_time.update(time.time() - end)
        train_step += 1

        img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map \
            = to_device(img, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)

        output, gcn_data = model(img, gt_roi, to_device)
        tr_loss, tcl_loss, sin_loss, cos_loss, radii_loss, gcn_loss \
            = criterion(output, gcn_data, train_mask, tr_mask, tcl_mask, radius_map, sin_map, cos_map)
        loss = tr_loss + tcl_loss + sin_loss + cos_loss + radii_loss + gcn_loss

        # backward; skip the batch if backward fails (e.g. a degenerate ROI
        # graph). BUGFIX: narrowed from a bare `except:` so Ctrl-C and real
        # programming errors are no longer swallowed.
        try:
            optimizer.zero_grad()
            loss.backward()
        except RuntimeError:
            print("loss gg")
            continue
        optimizer.step()

        losses.update(loss.item())

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        gc.collect()

        if cfg.viz and i % cfg.viz_freq == 0:
            visualize_network_output(output, tr_mask, tcl_mask[:, :, :, 0], mode='train')

        if i % cfg.display_freq == 0:
            print(
                '({:d} / {:d}) Loss: {:.4f} tr_loss: {:.4f} tcl_loss: {:.4f} '
                'sin_loss: {:.4f} cos_loss: {:.4f} radii_loss: {:.4f} gcn_loss: {:.4f}'
                .format(i, len(train_loader), loss.item(), tr_loss.item(),
                        tcl_loss.item(), sin_loss.item(), cos_loss.item(),
                        radii_loss.item(), gcn_loss.item()))

        if i % cfg.log_freq == 0:
            # BUGFIX: the gcn scalar key was 'gcn_loss:' (stray trailing
            # colon), producing a malformed tag inconsistent with the others.
            logger.write_scalars(
                {
                    'loss': loss.item(),
                    'tr_loss': tr_loss.item(),
                    'tcl_loss': tcl_loss.item(),
                    'sin_loss': sin_loss.item(),
                    'cos_loss': cos_loss.item(),
                    'radii_loss': radii_loss.item(),
                    'gcn_loss': gcn_loss.item()
                }, tag='train', n_iter=train_step)

    if epoch % cfg.save_freq == 0:
        save_model(model, epoch, scheduler.get_lr(), optimizer)

    print('Training Loss: {}'.format(losses.avg))
def train(train_loader, model, criterion, optimizer, epoch, tflogger):
    # One training epoch for a PSENet-style detector: the model emits 7 maps
    # per image (1 text map + 6 shrunk "kernel" maps); text loss uses OHEM,
    # kernel losses are masked to confident text regions. Returns epoch-level
    # loss / accuracy / IoU statistics.
    model.train()
    #taglist = ['module.conv1.weight','module.bn1.weight','module.bn1.bias','module.conv2.weight','module.conv2.bias','module.bn2.weight','module.bn2.bias','module.conv3.weight','module.conv3.bias','module.bn3.weight','module.bn3.bia','module.conv4.weight','module.conv4.bias']
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # Running 2-class (text / non-text) score accumulators for the epoch.
    running_metric_text = runningScore(2)
    running_metric_kernel = runningScore(2)
    global globalcounter
    end = time.time()
    for batch_idx, (imgs, gt_texts, gt_kernels, training_masks) in enumerate(train_loader):
        data_time.update(time.time() - end)
        # NOTE(review): Variable(...) is the pre-0.4 PyTorch API; tensors
        # would work directly on modern versions.
        imgs = Variable(imgs.cuda())
        gt_texts = Variable(gt_texts.cuda())
        gt_kernels = Variable(gt_kernels.cuda())
        training_masks = Variable(training_masks.cuda())
        outputs = model(imgs)
        # Channel 0 is the full text map; channels 1..6 are the kernel maps.
        texts = outputs[:, 0, :, :]
        kernels = outputs[:, 1:, :, :]
        # Online hard example mining selects which pixels count for text loss.
        selected_masks = ohem_batch(texts, gt_texts, training_masks)
        selected_masks = Variable(selected_masks.cuda())
        loss_text = criterion(texts, gt_texts, selected_masks)
        loss_kernels = []
        # Kernel losses are only computed where the predicted text score is
        # confident (> 0.5) AND the pixel is not masked out for training.
        mask0 = torch.sigmoid(texts).data.cpu().numpy()
        mask1 = training_masks.data.cpu().numpy()
        selected_masks = ((mask0 > 0.5) & (mask1 > 0.5)).astype('float32')
        selected_masks = torch.from_numpy(selected_masks).float()
        selected_masks = Variable(selected_masks.cuda())
        for i in range(6):
            kernel_i = kernels[:, i, :, :]
            gt_kernel_i = gt_kernels[:, i, :, :]
            loss_kernel_i = criterion(kernel_i, gt_kernel_i, selected_masks)
            loss_kernels.append(loss_kernel_i)
        loss_kernel = sum(loss_kernels) / len(loss_kernels)
        # Fixed 0.7 / 0.3 weighting between text and kernel losses.
        loss = 0.7 * loss_text + 0.3 * loss_kernel
        losses.update(loss.item(), imgs.size(0))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        score_text = cal_text_score(texts, gt_texts, training_masks, running_metric_text)
        score_kernel = cal_kernel_score(kernels, gt_kernels, gt_texts, training_masks, running_metric_kernel)
        batch_time.update(time.time() - end)
        end = time.time()
        # Console progress line every 20 batches.
        if batch_idx % 20 == 0:
            output_log = '({batch}/{size}) Batch: {bt:.3f}s | TOTAL: {total:.0f}min | ETA: {eta:.0f}min | Loss: {loss:.4f} | Acc_t: {acc: .4f} | IOU_t: {iou_t: .4f} | IOU_k: {iou_k: .4f}'.format(
                batch=batch_idx + 1,
                size=len(train_loader),
                bt=batch_time.avg,
                total=batch_time.avg * batch_idx / 60.0,
                eta=batch_time.avg * (len(train_loader) - batch_idx) / 60.0,
                loss=losses.avg,
                acc=score_text['Mean Acc'],
                iou_t=score_text['Mean IoU'],
                iou_k=score_kernel['Mean IoU'])
            print(output_log)
            sys.stdout.flush()
        # Parameter / gradient histograms every 100 batches.
        if batch_idx % 100 == 0:
            for tag, value in model.named_parameters():
                tag = tag.replace('.', '/')
                tflogger.histo_summary(tag, value.data.detach().cpu().numpy(), globalcounter)
                tflogger.histo_summary(tag + '/grad', value.grad.data.detach().cpu().numpy(), globalcounter)
        # NOTE(review): indentation reconstructed from a collapsed source —
        # assumed the global step advances once per batch; confirm it should
        # not instead sit inside the histogram-logging branch above.
        globalcounter += 1
    return (float(losses.avg), float(score_text['Mean Acc']),
            float(score_kernel['Mean Acc']), float(score_text['Mean IoU']),
            float(score_kernel['Mean IoU']))
def _evaluate(data_loader, model, criterion, observer, observer_criterion, writer, epoch, logging_label, no_cuda=False, log_interval=10, **kwargs):
    """
    The evaluation routine (with an auxiliary "observer" network evaluated on
    the main model's second-to-last-layer features).

    Parameters
    ----------
    :param data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set
    :param model : torch.nn.Module
        The network model being used
    :param criterion: torch.nn.loss
        The loss function used to compute the loss of the model
    :param observer : torch.nn.Module
        Auxiliary network evaluated on ``model.module.features``
    :param observer_criterion : torch.nn.loss
        Loss function for the observer network
    :param writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    :param epoch : int
        Number of the epoch (for logging purposes)
    :param logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. It's
        prepended to the logging output path and messages.
    :param no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    :param log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.
    :return: top1.avg (float), accuracy of the model on the evaluated split
    """
    # 'run' distinguishes repeated runs of the same experiment in the logs.
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Instantiate the counters
    batch_time = AverageMeter()
    losses = AverageMeter()
    observer_loss_meter = AverageMeter()
    top1 = AverageMeter()
    observer_acc_meter = AverageMeter()
    data_time = AverageMeter()

    # Switch to evaluate mode (turn off dropout & such)
    model.eval()

    # Iterate over whole evaluation set
    end = time.time()

    # Empty lists to store the predictions and target values
    preds = []
    targets = []

    pbar = tqdm(enumerate(data_loader), total=len(data_loader),
                unit='batch', ncols=150, leave=False)
    for batch_idx, (input, target) in pbar:
        # Measure data loading time
        data_time.update(time.time() - end)

        # Moving data to GPU.
        # BUGFIX: `async=True` is a SyntaxError on Python 3.7+ ('async' became
        # a keyword); PyTorch's replacement is `non_blocking=True`.
        if not no_cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        # Run everything without building the autograd graph (replaces the
        # deprecated `Variable(..., volatile=True)` idiom).
        with torch.no_grad():
            # Compute output
            output = model(input)

            # Get the features from the second-to-last layer.
            # NOTE(review): assumes the model stores them on
            # `model.module.features` during the forward pass — confirm.
            input_features = model.module.features.data

            # Use observer on the features
            observer_acc, observer_loss = evaluate_one_mini_batch(
                observer, observer_criterion, input_features, target,
                observer_loss_meter, observer_acc_meter)

            # Compute the loss; `.item()` replaces the removed `loss.data[0]`.
            loss = criterion(output, target)
        losses.update(loss.item(), input.size(0))

        # Compute and record the accuracy
        acc1 = accuracy(output.data, target, topk=(1, ))[0]
        top1.update(acc1[0], input.size(0))

        # Get the predictions
        preds.extend(np.argmax(item) for item in output.data.cpu().numpy())
        targets.extend(target.cpu().numpy())

        # Add loss and accuracy to Tensorboard
        step = epoch * len(data_loader) + batch_idx
        if multi_run is None:
            writer.add_scalar(logging_label + '/mb_loss', loss.item(), step)
            writer.add_scalar(logging_label + '/mb_accuracy',
                              acc1.cpu().numpy(), step)
            writer.add_scalar(logging_label + '/obs_mb_loss',
                              observer_loss.item(), step)
            writer.add_scalar(logging_label + '/obs_mb_accuracy',
                              observer_acc.cpu().numpy(), step)
        else:
            writer.add_scalar(logging_label + '/mb_loss_{}'.format(multi_run),
                              loss.item(), step)
            writer.add_scalar(logging_label + '/mb_accuracy_{}'.format(multi_run),
                              acc1.cpu().numpy(), step)
            writer.add_scalar(logging_label + '/obs_mb_loss_{}'.format(multi_run),
                              observer_loss.item(), step)
            writer.add_scalar(logging_label + '/obs_mb_accuracy_{}'.format(multi_run),
                              observer_acc.cpu().numpy(), step)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            pbar.set_description(logging_label +
                                 ' epoch [{0}][{1}/{2}]\t'.format(
                                     epoch, batch_idx, len(data_loader)))
            pbar.set_postfix(
                Time='{batch_time.avg:.3f}\t'.format(batch_time=batch_time),
                Loss='{loss.avg:.4f}\t'.format(loss=losses),
                Acc1='{top1.avg:.3f}\t'.format(top1=top1),
                Data='{data_time.avg:.3f}\t'.format(data_time=data_time))

    # Make a confusion matrix
    try:
        cm = confusion_matrix(y_true=targets, y_pred=preds)
        confusion_matrix_heatmap = make_heatmap(cm, data_loader.dataset.classes)
    except ValueError:
        logging.warning('Confusion Matrix did not work as expected')
        confusion_matrix_heatmap = np.zeros((10, 10, 3))

    # Logging the epoch-wise accuracy and confusion matrix.
    # NOTE(review): a sibling _evaluate in this file passes the heatmap as
    # `image=` instead of `image_tensor=` — confirm which keyword the shared
    # save_image_and_log_to_tensorboard helper actually accepts.
    if multi_run is None:
        writer.add_scalar(logging_label + '/accuracy', top1.avg, epoch)
        writer.add_scalar(logging_label + '/obs_accuracy',
                          observer_acc_meter.avg, epoch)
        save_image_and_log_to_tensorboard(
            writer,
            tag=logging_label + '/confusion_matrix',
            image_tensor=confusion_matrix_heatmap,
            global_step=epoch)
    else:
        writer.add_scalar(logging_label + '/accuracy_{}'.format(multi_run),
                          top1.avg, epoch)
        writer.add_scalar(logging_label + '/obs_accuracy_{}'.format(multi_run),
                          observer_acc_meter.avg, epoch)
        save_image_and_log_to_tensorboard(
            writer,
            tag=logging_label + '/confusion_matrix_{}'.format(multi_run),
            image_tensor=confusion_matrix_heatmap,
            global_step=epoch)

    logging.info(
        _prettyprint_logging_label(logging_label) + ' epoch[{}]: '
        'Acc@1={top1.avg:.3f}\t'
        'Loss={loss.avg:.4f}\t'
        'Batch time={batch_time.avg:.3f} ({data_time.avg:.3f} to load data)'.
        format(epoch, batch_time=batch_time, data_time=data_time,
               loss=losses, top1=top1))

    # Generate a classification report for each epoch
    _log_classification_report(data_loader, epoch, preds, targets, writer)

    return top1.avg