def train_one_epoch(task_model, task_optimizer, data_loader, device, cycle, epoch, print_freq):
    """Train the task model for a single epoch of the given active-learning cycle.

    Returns the populated ``utils.MetricLogger`` so the caller can inspect
    the epoch's running statistics.
    """
    task_model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('task_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Cycle:[{}] Epoch: [{}]'.format(cycle, epoch)

    # Linear LR warm-up is applied only during the very first epoch.
    task_lr_scheduler = None
    if epoch == 0:
        task_lr_scheduler = utils.warmup_lr_scheduler(
            task_optimizer, min(1000, len(data_loader) - 1), 1. / 1000)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = [img.to(device) for img in images]
        targets = [{key: val.to(device) for key, val in tgt.items()} for tgt in targets]

        task_loss_dict = task_model(images, targets)
        losses = sum(task_loss_dict.values())

        # Reduce over all GPUs for logging only; optimization uses the local loss.
        task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
        task_losses_reduced = sum(v.cpu() for v in task_loss_dict_reduced.values())
        task_loss_value = task_losses_reduced.item()

        # Abort immediately on NaN/inf so a diverged run cannot keep stepping.
        if not math.isfinite(task_loss_value):
            print("Loss is {}, stopping training".format(task_loss_value))
            sys.exit(1)

        task_optimizer.zero_grad()
        losses.backward()
        task_optimizer.step()
        if task_lr_scheduler is not None:
            task_lr_scheduler.step()

        metric_logger.update(task_loss=task_losses_reduced)
        metric_logger.update(task_lr=task_optimizer.param_groups[0]["lr"])
    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10, writer=None):
    """Train ``model`` for one epoch, optionally logging scalars to ``writer``.

    Advances the module-level ``global_step`` once per batch and returns the
    ``utils.MetricLogger`` with the epoch's running statistics.
    """
    global global_step
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # Linear LR warm-up during the first epoch only.
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 500
        warmup_iters = min(500, len(data_loader) - 1)
        lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, img_metas, targets in metric_logger.log_every(data_loader, print_freq, header):
        global_step += 1
        images = images.to(device)
        targets = [t.to(device) for t in targets]

        loss_dict, _ = model(images, img_metas, targets)
        losses = sum(list(loss_dict.values()))

        # Reduce over all GPUs for logging only; optimization uses the local loss.
        loss_dict_reduced = dist_utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        # BUGFIX: guard against NaN/inf loss (sibling train loops do the same);
        # previously a diverged run would keep stepping and corrupt the weights.
        loss_value = losses_reduced.item()
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if global_step % print_freq == 0 and writer:
            for k, v in loss_dict_reduced.items():
                writer.add_scalar('losses/{}'.format(k), v, global_step=global_step)
            writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)

    # BUGFIX: return the logger like the other train loops in this file;
    # the function previously returned None.
    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    """Run one training epoch, releasing per-step tensors and emptying the
    CUDA cache after every iteration to keep peak GPU memory low.

    Returns the ``utils.MetricLogger`` with the epoch's running statistics.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # Linear LR warm-up is applied during the first epoch only.
    lr_scheduler = None
    if epoch == 0:
        lr_scheduler = utils.warmup_lr_scheduler(
            optimizer, min(1000, len(data_loader) - 1), 1. / 1000)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = [img.to(device) for img in images]
        targets = [{key: val.to(device) for key, val in tgt.items()} for tgt in targets]

        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        # Reduced copies are for logging / divergence detection only.
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss_dict_reduced.values())
        loss_value = losses_reduced.item()
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced.detach().cpu(), **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # Drop references to the graph and reduced tensors, then return cached
        # blocks to the allocator before the next batch.
        del losses
        del loss_value
        del loss_dict_reduced
        del losses_reduced
        torch.cuda.empty_cache()
    return metric_logger
def train_one_epoch(task_model, task_optimizer, ll_model, ll_optimizer, data_loader, device, cycle, epoch, print_freq):
    """Jointly train the detector and the loss-prediction module for one epoch.

    Learning-loss-for-active-learning style loop: the detector is trained on
    its usual losses while ``ll_model`` is trained (via ``LossPredLoss``) to
    predict the per-sample task loss from intermediate features.
    ``args``, ``LossPredLoss`` and ``MARGIN`` are module-level names not
    visible in this block — confirm they are defined at import time.
    """
    task_model.train()
    ll_model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('task_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter('ll_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Cycle:[{}] Epoch: [{}]'.format(cycle, epoch)

    # Linear LR warm-up for both optimizers during the first epoch only.
    task_lr_scheduler = None
    ll_lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        task_lr_scheduler = utils.warmup_lr_scheduler(task_optimizer, warmup_iters, warmup_factor)
        ll_lr_scheduler = utils.warmup_lr_scheduler(ll_optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # The task model here returns intermediate features alongside losses
        # (it is assumed to be modified to expose them — confirm upstream).
        features, task_loss_dict = task_model(images, targets)

        if 'faster' in args.model:
            # _task_losses keeps the un-averaged (per-sample) losses as the
            # regression target for the loss-prediction module.
            _task_losses = sum(loss for loss in task_loss_dict.values())
            # print(_task_losses)
            # Collapse per-sample losses to scalars for the detector update.
            task_loss_dict['loss_objectness'] = torch.mean(task_loss_dict['loss_objectness'])
            task_loss_dict['loss_rpn_box_reg'] = torch.mean(task_loss_dict['loss_rpn_box_reg'])
            task_loss_dict['loss_classifier'] = torch.mean(task_loss_dict['loss_classifier'])
            task_loss_dict['loss_box_reg'] = torch.mean(task_loss_dict['loss_box_reg'])
            task_losses = sum(loss for loss in task_loss_dict.values())
            # Reduce losses over all GPUs for logging purposes.
            task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
            task_losses_reduced = sum(loss.cpu() for loss in task_loss_dict_reduced.values())
            task_loss_value = task_losses_reduced.item()
            if epoch >= args.task_epochs:
                # After EPOCHL epochs, stop the gradient from the loss
                # prediction module propagated to the target model.
                features['0'] = features['0'].detach()
                features['1'] = features['1'].detach()
                features['2'] = features['2'].detach()
                features['3'] = features['3'].detach()
            # NOTE(review): original indentation was lost; this call is placed
            # outside the detach-guard (mirroring the retina branch) so that
            # ll_pred is defined in early epochs too — confirm against the
            # original file.
            ll_pred = ll_model(features).cuda()
        elif 'retina' in args.model:
            # RetinaNet losses arrive as (scalar, per-sample-list) pairs;
            # stack the per-sample parts as the loss-prediction target.
            _task_losses = sum(torch.stack(loss[1]) for loss in task_loss_dict.values())
            task_loss_dict['classification'] = task_loss_dict['classification'][0]
            task_loss_dict['bbox_regression'] = task_loss_dict['bbox_regression'][0]
            # for loss in task_loss_dict.values():
            #     print(loss)
            task_losses = sum(loss for loss in task_loss_dict.values())
            task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
            task_losses_reduced = sum(loss.cpu() for loss in task_loss_dict_reduced.values())
            task_loss_value = task_losses_reduced.item()
            if epoch >= args.task_epochs:
                # After EPOCHL epochs, stop the gradient from the loss
                # prediction module propagated to the target model.
                _features = dict()
                _features['0'] = features[0].detach()
                _features['1'] = features[1].detach()
                _features['2'] = features[2].detach()
                _features['3'] = features[3].detach()
            else:
                _features = dict()
                _features['0'] = features[0]
                _features['1'] = features[1]
                _features['2'] = features[2]
                _features['3'] = features[3]
            ll_pred = ll_model(_features).cuda()

        # Flatten predictions and weight the loss-prediction objective.
        ll_pred = ll_pred.view(ll_pred.size(0))
        ll_loss = args.ll_weight * LossPredLoss(ll_pred, _task_losses, margin=MARGIN)
        losses = task_losses + ll_loss

        # Abort on divergence before stepping either optimizer.
        if not math.isfinite(task_loss_value):
            print("Loss is {}, stopping training".format(task_loss_value))
            print(task_loss_dict_reduced)
            sys.exit(1)

        task_optimizer.zero_grad()
        ll_optimizer.zero_grad()
        losses.backward()
        task_optimizer.step()
        ll_optimizer.step()
        if task_lr_scheduler is not None:
            task_lr_scheduler.step()
        if ll_lr_scheduler is not None:
            ll_lr_scheduler.step()

        metric_logger.update(task_loss=task_losses_reduced)
        metric_logger.update(task_lr=task_optimizer.param_groups[0]["lr"])
        metric_logger.update(ll_loss=ll_loss.item())
        metric_logger.update(ll_lr=ll_optimizer.param_groups[0]["lr"])
    return metric_logger
def train_one_epoch(task_model, task_optimizer, vae, vae_optimizer, discriminator, discriminator_optimizer, labeled_dataloader, unlabeled_dataloader, device, cycle, epoch, print_freq):
    """One epoch of VAAL-style training: first the detector on labeled data,
    then the VAE + discriminator adversarial pair on labeled and unlabeled
    images.  ``vae_loss`` and ``bce_loss`` are module-level names not visible
    in this block — confirm they are defined at import time.
    """
    # NOTE(review): despite its name this generator is also used for the
    # *labeled* loader below; it yields device-resident image batches forever.
    def read_unlabeled_data(dataloader):
        while True:
            for images, _ in dataloader:
                yield list(image.to(device) for image in images)

    labeled_data = read_unlabeled_data(labeled_dataloader)
    unlabeled_data = read_unlabeled_data(unlabeled_dataloader)

    task_model.train()
    vae.train()
    discriminator.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('task_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Cycle:[{}] Epoch: [{}]'.format(cycle, epoch)

    # Linear LR warm-up for all three optimizers during the first epoch only.
    task_lr_scheduler = None
    vae_lr_scheduler = None
    discriminator_lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(labeled_dataloader) - 1)
        task_lr_scheduler = utils.warmup_lr_scheduler(task_optimizer, warmup_iters, warmup_factor)
        vae_lr_scheduler = utils.warmup_lr_scheduler(vae_optimizer, warmup_iters, warmup_factor)
        discriminator_lr_scheduler = utils.warmup_lr_scheduler(
            discriminator_optimizer, warmup_iters, warmup_factor)

    # --- Phase 1: supervised detector training on the labeled pool. ---
    for images, targets in metric_logger.log_every(labeled_dataloader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        task_loss_dict = task_model(images, targets)
        task_losses = sum(loss for loss in task_loss_dict.values())
        # Reduce losses over all GPUs for logging purposes.
        task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
        task_losses_reduced = sum(loss.cpu() for loss in task_loss_dict_reduced.values())
        task_loss_value = task_losses_reduced.item()
        losses = task_losses
        if not math.isfinite(task_loss_value):
            print("Loss is {}, stopping training".format(task_loss_value))
            print(task_loss_dict_reduced)
            sys.exit(1)
        task_optimizer.zero_grad()
        losses.backward()
        task_optimizer.step()
        if task_lr_scheduler is not None:
            task_lr_scheduler.step()
        metric_logger.update(task_loss=task_losses_reduced)
        metric_logger.update(task_lr=task_optimizer.param_groups[0]["lr"])

    # --- Phase 2: adversarial VAE / discriminator training. ---
    # NOTE(review): original indentation was lost in this chunk; this loop is
    # placed at function level (after the detector loop), which matches its
    # iteration count — confirm against the original file.
    for i in range(len(labeled_dataloader)):
        unlabeled_imgs = next(unlabeled_data)
        labeled_imgs = next(labeled_data)

        # VAE step: reconstruct both pools and fool the discriminator into
        # labeling unlabeled latents as "labeled" (all-ones targets).
        recon, z, mu, logvar = vae(labeled_imgs)
        unsup_loss = vae_loss(labeled_imgs, recon, mu, logvar, 1)
        unlab_recon, unlab_z, unlab_mu, unlab_logvar = vae(unlabeled_imgs)
        transductive_loss = vae_loss(unlabeled_imgs, unlab_recon, unlab_mu, unlab_logvar, 1)
        labeled_preds = discriminator(mu)
        unlabeled_preds = discriminator(unlab_mu)
        lab_real_preds = torch.ones(len(labeled_imgs)).cuda()
        unlab_real_preds = torch.ones(len(unlabeled_imgs)).cuda()
        # Align target rank with the discriminator output if shapes differ.
        if not len(labeled_preds.shape) == len(lab_real_preds.shape):
            dsc_loss = bce_loss(labeled_preds, lab_real_preds.unsqueeze(1)) + bce_loss(
                unlabeled_preds, unlab_real_preds.unsqueeze(1))
        else:
            dsc_loss = bce_loss(labeled_preds, lab_real_preds) + bce_loss(
                unlabeled_preds, unlab_real_preds)
        total_vae_loss = unsup_loss + transductive_loss + dsc_loss
        vae_optimizer.zero_grad()
        total_vae_loss.backward()
        vae_optimizer.step()

        # Discriminator step: latents are recomputed without gradients so only
        # the discriminator is updated; unlabeled latents now get zero targets.
        with torch.no_grad():
            _, _, mu, _ = vae(labeled_imgs)
            _, _, unlab_mu, _ = vae(unlabeled_imgs)
        labeled_preds = discriminator(mu)
        unlabeled_preds = discriminator(unlab_mu)
        lab_real_preds = torch.ones(len(labeled_imgs)).cuda()
        unlab_fake_preds = torch.zeros(len(unlabeled_imgs)).cuda()
        if not len(labeled_preds.shape) == len(lab_real_preds.shape):
            dsc_loss = bce_loss(labeled_preds, lab_real_preds.unsqueeze(1)) + bce_loss(
                unlabeled_preds, unlab_fake_preds.unsqueeze(1))
        else:
            dsc_loss = bce_loss(labeled_preds, lab_real_preds) + bce_loss(
                unlabeled_preds, unlab_fake_preds)
        discriminator_optimizer.zero_grad()
        dsc_loss.backward()
        discriminator_optimizer.step()

        if vae_lr_scheduler is not None:
            vae_lr_scheduler.step()
        if discriminator_lr_scheduler is not None:
            discriminator_lr_scheduler.step()

        # Report the adversarial losses once, on the last iteration.
        # NOTE(review): the string literal was split across lines by the
        # whitespace collapse; reconstructed with an explicit \n — confirm.
        if i == len(labeled_dataloader) - 1:
            print('vae_loss: {}\ndis_loss:{}'.format(total_vae_loss, dsc_loss))
    return metric_logger
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10):
    """Train ``model`` for one epoch and log per-iteration losses to TensorBoard.

    Returns the ``utils.MetricLogger`` with the epoch's running statistics.

    BUGFIX: the previous version had the entire optimization path (loss
    reduction, NaN guard, backward, optimizer/scheduler steps, TensorBoard
    logging) commented out, fed a dummy ``torch.tensor([0.])`` to the metric
    logger, contained stray debug statements (``if epoch == 27: print('')``,
    a per-iteration print with a hard-coded "/70"), and returned nothing —
    so the model was never actually trained.  The evidently intended training
    path is restored here.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # Linear LR warm-up during the first epoch only.
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    # NOTE(review): `log_dir` is a module-level name not visible in this
    # block — confirm it is defined where this function is used.
    writer = SummaryWriter(log_dir=log_dir)
    i = 0
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        i += 1

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Reduce over all GPUs for logging only; optimization uses the local loss.
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()

        # Per-iteration TensorBoard scalars, indexed by a global step.
        writer.add_scalars(
            'train',
            {'loss_all': loss_value,
             'loss_box_reg': loss_dict_reduced['loss_box_reg'].item(),
             'loss_classifier': loss_dict_reduced['loss_classifier'].item(),
             'loss_objectness': loss_dict_reduced['loss_objectness'].item(),
             'loss_rpn_box_reg': loss_dict_reduced['loss_rpn_box_reg'].item()},
            epoch * len(data_loader) + i)

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    return metric_logger
def train_one_epoch(model, optimizer, train_loader, target_loader, device, epoch, dis_model, dis_optimizer, print_freq=10, writer=None, test_func=None, save_func=None):
    """One epoch of adversarial domain-adaptation training.

    Each source batch is paired with a target-domain batch (the target loader
    is re-created when exhausted); the model returns detection losses plus an
    'adv_loss' sub-dict, combined as det + LAMBDA * adv with a cosine-scheduled
    LAMBDA.  ``cfg``, ``cosine_scheduler``, ``global_step`` and ``best_mAP``
    are module-level names not visible in this block — confirm they exist.
    Note: ``dis_model`` / ``dis_optimizer`` are accepted but never used here.
    """
    global global_step
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.0e}'))
    metric_logger.add_meter('LAMBDA', utils.SmoothedValue(window_size=1, fmt='{value:.3f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # Warm-up schedulers are currently disabled (list stays empty), but the
    # step loop below is kept so they can be re-enabled by uncommenting.
    lr_schedulers = []
    if epoch == 0:
        warmup_factor = 1. / 500
        warmup_iters = min(500, len(train_loader) - 1)
        # lr_schedulers = [
        #     warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor),
        #     warmup_lr_scheduler(dis_optimizer, warmup_iters, warmup_factor),
        # ]

    target_loader_iter = iter(target_loader)
    for images, img_metas, targets in metric_logger.log_every(train_loader, print_freq, header):
        global_step += 1
        images = images.to(device)
        targets = [t.to(device) for t in targets]

        # Draw the next target-domain batch; restart the iterator when the
        # (typically shorter) target loader is exhausted.
        try:
            t_images, t_img_metas, _ = next(target_loader_iter)
        except StopIteration:
            target_loader_iter = iter(target_loader)
            t_images, t_img_metas, _ = next(target_loader_iter)
        t_images = t_images.to(device)

        loss_dict, outputs = model(images, img_metas, targets, t_images, t_img_metas)
        # Split adversarial losses out of the dict; keep both for logging.
        adv_loss = loss_dict.pop('adv_loss')
        loss_dict_for_log = dict(**loss_dict, **adv_loss)

        det_loss = sum(list(loss_dict.values()))
        ada_loss = sum(list(adv_loss.values()))
        # Cosine-annealed adversarial weight between the configured endpoints.
        LAMBDA = cosine_scheduler(cfg.ADV.LAMBDA_FROM, cfg.ADV.LAMBDA_TO, global_step)
        losses = det_loss + ada_loss * LAMBDA

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Reduce over all GPUs for logging only.
        loss_dict_reduced = dist_utils.reduce_dict(loss_dict_for_log)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        for lr_scheduler in lr_schedulers:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.update(LAMBDA=LAMBDA)

        if global_step % print_freq == 0:
            if writer:
                for k, v in loss_dict_reduced.items():
                    writer.add_scalar('losses/{}'.format(k), v, global_step=global_step)
                writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
                writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)
                writer.add_scalar('LAMBDA', LAMBDA, global_step=global_step)

        # Periodic mid-epoch evaluation; the interval shrinks with world size.
        if global_step % (2000 // max(1, (dist_utils.get_world_size() // 2))) == 0 and test_func is not None:
            updated = test_func()
            # NOTE(review): `best_mAP` is presumably a module global updated
            # inside test_func — verify; original indentation was lost, the
            # print is placed outside `if updated` so the best score is always
            # reported after an evaluation — confirm against the original file.
            if updated:
                save_func('best.pth', 'mAP: {:.4f}'.format(best_mAP))
            print('Best mAP: {:.4f}'.format(best_mAP))