Example #1
def train_one_epoch(task_model, task_optimizer, data_loader, device, cycle, epoch, print_freq):
    task_model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('task_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Cycle:[{}] Epoch: [{}]'.format(cycle, epoch)

    task_lr_scheduler = None

    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        task_lr_scheduler = utils.warmup_lr_scheduler(task_optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        task_loss_dict = task_model(images, targets)
        task_losses = sum(loss for loss in task_loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
        task_losses_reduced = sum(loss.cpu() for loss in task_loss_dict_reduced.values())
        task_loss_value = task_losses_reduced.item()
        losses = task_losses
        if not math.isfinite(task_loss_value):
            print("Loss is {}, stopping training".format(task_loss_value))
            sys.exit(1)

        task_optimizer.zero_grad()
        losses.backward()
        task_optimizer.step()
        if task_lr_scheduler is not None:
            task_lr_scheduler.step()
        metric_logger.update(task_loss=task_losses_reduced)
        metric_logger.update(task_lr=task_optimizer.param_groups[0]["lr"])
    return metric_logger
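
Every example in this list relies on a `warmup_lr_scheduler` helper during the first epoch that is not shown here. A minimal sketch in the style of the torchvision detection references, assuming the helper simply ramps the learning rate linearly over the first iterations:

import torch

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    # ramp the learning rate linearly from warmup_factor * base_lr to base_lr
    # over the first warmup_iters calls to scheduler.step()
    def f(x):
        if x >= warmup_iters:
            return 1
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)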
Example #2
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq=10,
                    writer=None):
    global global_step
    model.train()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 500
        warmup_iters = min(500, len(data_loader) - 1)
        lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters,
                                           warmup_factor)

    for images, img_metas, targets in metric_logger.log_every(
            data_loader, print_freq, header):
        global_step += 1
        images = images.to(device)
        targets = [t.to(device) for t in targets]

        loss_dict, _ = model(images, img_metas, targets)
        losses = sum(list(loss_dict.values()))

        loss_dict_reduced = dist_utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()
        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if global_step % print_freq == 0:
            if writer:
                for k, v in loss_dict_reduced.items():
                    writer.add_scalar('losses/{}'.format(k),
                                      v,
                                      global_step=global_step)
                writer.add_scalar('losses/total_loss',
                                  losses_reduced,
                                  global_step=global_step)
                writer.add_scalar('lr',
                                  optimizer.param_groups[0]['lr'],
                                  global_step=global_step)
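
`global_step` is a module-level counter and `dist_utils` comes from the enclosing project. A minimal sketch of how this variant might be driven, assuming a `torch.utils.tensorboard.SummaryWriter` and that `model`, `optimizer`, `data_loader`, `device`, and `num_epochs` are set up elsewhere in the calling script:

from torch.utils.tensorboard import SummaryWriter

global_step = 0  # incremented inside train_one_epoch

writer = SummaryWriter(log_dir='runs/detector')  # hypothetical log directory
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=10, writer=writer)
writer.close()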
Example #3
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced.detach().cpu(),
                             **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        del losses
        del loss_value
        del loss_dict_reduced
        del losses_reduced
        torch.cuda.empty_cache()
    return metric_logger
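
These per-epoch helpers are normally wrapped in an outer loop that also steps an epoch-level scheduler; a minimal sketch, assuming `model`, `data_loader`, `data_loader_test`, `device`, and a torchvision-style `evaluate` helper already exist in the calling script:

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

num_epochs = 10
for epoch in range(num_epochs):
    # per-iteration warm-up happens inside train_one_epoch during epoch 0
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    lr_scheduler.step()  # epoch-level learning-rate decay
    evaluate(model, data_loader_test, device=device)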
Example #4
def train_one_epoch(task_model, task_optimizer, ll_model, ll_optimizer,
                    data_loader, device, cycle, epoch, print_freq):
    task_model.train()
    ll_model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'task_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'll_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Cycle:[{}] Epoch: [{}]'.format(cycle, epoch)

    task_lr_scheduler = None
    ll_lr_scheduler = None

    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        task_lr_scheduler = utils.warmup_lr_scheduler(task_optimizer,
                                                      warmup_iters,
                                                      warmup_factor)
        ll_lr_scheduler = utils.warmup_lr_scheduler(ll_optimizer, warmup_iters,
                                                    warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        features, task_loss_dict = task_model(images, targets)
        if 'faster' in args.model:
            _task_losses = sum(loss for loss in task_loss_dict.values())
            # print(_task_losses)
            task_loss_dict['loss_objectness'] = torch.mean(
                task_loss_dict['loss_objectness'])
            task_loss_dict['loss_rpn_box_reg'] = torch.mean(
                task_loss_dict['loss_rpn_box_reg'])
            task_loss_dict['loss_classifier'] = torch.mean(
                task_loss_dict['loss_classifier'])
            task_loss_dict['loss_box_reg'] = torch.mean(
                task_loss_dict['loss_box_reg'])
            task_losses = sum(loss for loss in task_loss_dict.values())
            # reduce losses over all GPUs for logging purposes
            task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
            task_losses_reduced = sum(
                loss.cpu() for loss in task_loss_dict_reduced.values())
            task_loss_value = task_losses_reduced.item()
            if epoch >= args.task_epochs:
                # After EPOCHL epochs, stop the gradient from the loss prediction module propagated to the target model.
                features['0'] = features['0'].detach()
                features['1'] = features['1'].detach()
                features['2'] = features['2'].detach()
                features['3'] = features['3'].detach()
            ll_pred = ll_model(features).cuda()
        elif 'retina' in args.model:
            _task_losses = sum(
                torch.stack(loss[1]) for loss in task_loss_dict.values())
            task_loss_dict['classification'] = task_loss_dict[
                'classification'][0]
            task_loss_dict['bbox_regression'] = task_loss_dict[
                'bbox_regression'][0]
            # for loss in task_loss_dict.values():
            #     print(loss)
            task_losses = sum(loss for loss in task_loss_dict.values())
            task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
            task_losses_reduced = sum(
                loss.cpu() for loss in task_loss_dict_reduced.values())
            task_loss_value = task_losses_reduced.item()
            if epoch >= args.task_epochs:
                # After EPOCHL epochs, stop the gradient from the loss prediction module propagated to the target model.
                _features = dict()
                _features['0'] = features[0].detach()
                _features['1'] = features[1].detach()
                _features['2'] = features[2].detach()
                _features['3'] = features[3].detach()
            else:
                _features = dict()
                _features['0'] = features[0]
                _features['1'] = features[1]
                _features['2'] = features[2]
                _features['3'] = features[3]
            ll_pred = ll_model(_features).cuda()
        ll_pred = ll_pred.view(ll_pred.size(0))
        ll_loss = args.ll_weight * LossPredLoss(
            ll_pred, _task_losses, margin=MARGIN)
        losses = task_losses + ll_loss
        if not math.isfinite(task_loss_value):
            print("Loss is {}, stopping training".format(task_loss_value))
            print(task_loss_dict_reduced)
            sys.exit(1)

        task_optimizer.zero_grad()
        ll_optimizer.zero_grad()
        losses.backward()
        task_optimizer.step()
        ll_optimizer.step()
        if task_lr_scheduler is not None:
            task_lr_scheduler.step()
        if ll_lr_scheduler is not None:
            ll_lr_scheduler.step()
        metric_logger.update(task_loss=task_losses_reduced)
        metric_logger.update(task_lr=task_optimizer.param_groups[0]["lr"])
        metric_logger.update(ll_loss=ll_loss.item())
        metric_logger.update(ll_lr=ll_optimizer.param_groups[0]["lr"])
    return metric_logger
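
`LossPredLoss`, `MARGIN`, and `args` are defined outside this snippet. If this follows the "Learning Loss for Active Learning" formulation, the loss is a pairwise margin ranking term between the predicted and actual task losses; a minimal sketch, assuming an even batch size and that `torch` is imported:

def LossPredLoss(input, target, margin=1.0, reduction='mean'):
    # compare pairs formed by flipping the batch; penalize predicted-loss pairs
    # whose ordering disagrees with the ordering of the real losses
    assert len(input) % 2 == 0, 'the batch size is not even.'
    input = (input - input.flip(0))[:len(input) // 2]
    target = (target - target.flip(0))[:len(target) // 2]
    target = target.detach()  # no gradient through the real losses

    one = 2 * torch.sign(torch.clamp(target, min=0)) - 1  # +1 or -1

    if reduction == 'mean':
        loss = torch.sum(torch.clamp(margin - one * input, min=0))
        loss = loss / input.size(0)
    elif reduction == 'none':
        loss = torch.clamp(margin - one * input, min=0)
    else:
        raise NotImplementedError()
    return loss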
Example #5
def train_one_epoch(task_model, task_optimizer, vae, vae_optimizer,
                    discriminator, discriminator_optimizer, labeled_dataloader,
                    unlabeled_dataloader, device, cycle, epoch, print_freq):
    def read_unlabeled_data(dataloader):
        # infinite generator over a dataloader; despite its name it is used
        # for both the labeled and the unlabeled loaders further below
        while True:
            for images, _ in dataloader:
                yield list(image.to(device) for image in images)

    labeled_data = read_unlabeled_data(labeled_dataloader)
    unlabeled_data = read_unlabeled_data(unlabeled_dataloader)
    task_model.train()
    vae.train()
    discriminator.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'task_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Cycle:[{}] Epoch: [{}]'.format(cycle, epoch)

    task_lr_scheduler = None
    vae_lr_scheduler = None
    discriminator_lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(labeled_dataloader) - 1)

        task_lr_scheduler = utils.warmup_lr_scheduler(task_optimizer,
                                                      warmup_iters,
                                                      warmup_factor)
        vae_lr_scheduler = utils.warmup_lr_scheduler(vae_optimizer,
                                                     warmup_iters,
                                                     warmup_factor)
        discriminator_lr_scheduler = utils.warmup_lr_scheduler(
            discriminator_optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(labeled_dataloader,
                                                   print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        task_loss_dict = task_model(images, targets)
        task_losses = sum(loss for loss in task_loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        task_loss_dict_reduced = utils.reduce_dict(task_loss_dict)
        task_losses_reduced = sum(loss.cpu()
                                  for loss in task_loss_dict_reduced.values())
        task_loss_value = task_losses_reduced.item()
        losses = task_losses
        if not math.isfinite(task_loss_value):
            print("Loss is {}, stopping training".format(task_loss_value))
            print(task_loss_dict_reduced)
            sys.exit(1)
        task_optimizer.zero_grad()
        losses.backward()
        task_optimizer.step()
        if task_lr_scheduler is not None:
            task_lr_scheduler.step()
        metric_logger.update(task_loss=task_losses_reduced)
        metric_logger.update(task_lr=task_optimizer.param_groups[0]["lr"])

    for i in range(len(labeled_dataloader)):
        unlabeled_imgs = next(unlabeled_data)
        labeled_imgs = next(labeled_data)
        recon, z, mu, logvar = vae(labeled_imgs)
        unsup_loss = vae_loss(labeled_imgs, recon, mu, logvar, 1)
        unlab_recon, unlab_z, unlab_mu, unlab_logvar = vae(unlabeled_imgs)
        transductive_loss = vae_loss(unlabeled_imgs, unlab_recon, unlab_mu,
                                     unlab_logvar, 1)

        labeled_preds = discriminator(mu)
        unlabeled_preds = discriminator(unlab_mu)

        lab_real_preds = torch.ones(len(labeled_imgs)).cuda()
        unlab_real_preds = torch.ones(len(unlabeled_imgs)).cuda()

        if not len(labeled_preds.shape) == len(lab_real_preds.shape):
            dsc_loss = bce_loss(
                labeled_preds, lab_real_preds.unsqueeze(1)) + bce_loss(
                    unlabeled_preds, unlab_real_preds.unsqueeze(1))
        else:
            dsc_loss = bce_loss(labeled_preds, lab_real_preds) + bce_loss(
                unlabeled_preds, unlab_real_preds)
        total_vae_loss = unsup_loss + transductive_loss + dsc_loss
        vae_optimizer.zero_grad()
        total_vae_loss.backward()
        vae_optimizer.step()

        # Discriminator step
        with torch.no_grad():
            _, _, mu, _ = vae(labeled_imgs)
            _, _, unlab_mu, _ = vae(unlabeled_imgs)

        labeled_preds = discriminator(mu)
        unlabeled_preds = discriminator(unlab_mu)

        lab_real_preds = torch.ones(len(labeled_imgs)).cuda()
        unlab_fake_preds = torch.zeros(len(unlabeled_imgs)).cuda()

        if not len(labeled_preds.shape) == len(lab_real_preds.shape):
            dsc_loss = bce_loss(
                labeled_preds, lab_real_preds.unsqueeze(1)) + bce_loss(
                    unlabeled_preds, unlab_fake_preds.unsqueeze(1))
        else:
            dsc_loss = bce_loss(labeled_preds, lab_real_preds) + bce_loss(
                unlabeled_preds, unlab_fake_preds)
        discriminator_optimizer.zero_grad()
        dsc_loss.backward()
        discriminator_optimizer.step()

        if vae_lr_scheduler is not None:
            vae_lr_scheduler.step()
        if discriminator_lr_scheduler is not None:
            discriminator_lr_scheduler.step()
        if i == len(labeled_dataloader) - 1:
            print('vae_loss: {} dis_loss:{}'.format(total_vae_loss, dsc_loss))

    return metric_logger
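
`vae_loss` and `bce_loss` are likewise defined elsewhere. In the VAAL-style setup this snippet resembles, they are typically a reconstruction-plus-KL objective and a plain binary cross-entropy; a minimal sketch, with `beta` weighting the KL term:

import torch
import torch.nn as nn

bce_loss = nn.BCELoss()

def vae_loss(x, recon, mu, logvar, beta):
    # standard VAE objective: reconstruction error plus beta-weighted KL divergence
    mse = nn.MSELoss()(recon, x)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return mse + beta * kld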
Example #6
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq=10):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)
    writer = SummaryWriter(log_dir=log_dir)
    i = 0
    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        i += 1
        print("epoch {}/70, iteration {}/{}".format(epoch, i, len(data_loader)))

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        writer.add_scalars('train', {'loss_all': loss_value,
                                     'loss_box_reg': loss_dict_reduced['loss_box_reg'].item(),
                                     'loss_classifier': loss_dict_reduced['loss_classifier'].item(),
                                     'loss_objectness': loss_dict_reduced['loss_objectness'].item(),
                                     'loss_rpn_box_reg': loss_dict_reduced['loss_rpn_box_reg'].item()},
                           epoch * len(data_loader) + i)
        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
Example #7
def train_one_epoch(model,
                    optimizer,
                    train_loader,
                    target_loader,
                    device,
                    epoch,
                    dis_model,
                    dis_optimizer,
                    print_freq=10,
                    writer=None,
                    test_func=None,
                    save_func=None):
    global global_step
    model.train()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.0e}'))
    metric_logger.add_meter(
        'LAMBDA', utils.SmoothedValue(window_size=1, fmt='{value:.3f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_schedulers = []
    if epoch == 0:
        warmup_factor = 1. / 500
        warmup_iters = min(500, len(train_loader) - 1)
        # warm-up is currently disabled; uncomment to apply it to both optimizers
        # lr_schedulers = [
        #     warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor),
        #     warmup_lr_scheduler(dis_optimizer, warmup_iters, warmup_factor),
        # ]

    target_loader_iter = iter(target_loader)
    for images, img_metas, targets in metric_logger.log_every(
            train_loader, print_freq, header):
        global_step += 1
        images = images.to(device)
        targets = [t.to(device) for t in targets]

        try:
            t_images, t_img_metas, _ = next(target_loader_iter)
        except StopIteration:
            target_loader_iter = iter(target_loader)
            t_images, t_img_metas, _ = next(target_loader_iter)

        t_images = t_images.to(device)

        loss_dict, outputs = model(images, img_metas, targets, t_images,
                                   t_img_metas)
        adv_loss = loss_dict.pop('adv_loss')
        loss_dict_for_log = dict(**loss_dict, **adv_loss)

        det_loss = sum(list(loss_dict.values()))
        ada_loss = sum(list(adv_loss.values()))

        LAMBDA = cosine_scheduler(cfg.ADV.LAMBDA_FROM, cfg.ADV.LAMBDA_TO,
                                  global_step)
        losses = det_loss + ada_loss * LAMBDA
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        loss_dict_reduced = dist_utils.reduce_dict(loss_dict_for_log)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        for lr_scheduler in lr_schedulers:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.update(LAMBDA=LAMBDA)

        if global_step % print_freq == 0:
            if writer:
                for k, v in loss_dict_reduced.items():
                    writer.add_scalar('losses/{}'.format(k),
                                      v,
                                      global_step=global_step)
                writer.add_scalar('losses/total_loss',
                                  losses_reduced,
                                  global_step=global_step)
                writer.add_scalar('lr',
                                  optimizer.param_groups[0]['lr'],
                                  global_step=global_step)
                writer.add_scalar('LAMBDA', LAMBDA, global_step=global_step)

        if global_step % (2000 // max(1, (dist_utils.get_world_size() // 2))
                          ) == 0 and test_func is not None:
            updated = test_func()
            if updated:
                save_func('best.pth', 'mAP: {:.4f}'.format(best_mAP))
            print('Best mAP: {:.4f}'.format(best_mAP))
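
`cosine_scheduler`, `cfg`, `best_mAP`, and `dist_utils` come from the surrounding project and are not shown. A plausible sketch of the LAMBDA ramp, under the assumption that it interpolates between `cfg.ADV.LAMBDA_FROM` and `cfg.ADV.LAMBDA_TO` over a fixed number of steps (the `total_steps` value here is hypothetical):

import math

def cosine_scheduler(start, end, step, total_steps=10000):
    # interpolate from `start` to `end` along half a cosine period,
    # holding the final value once `step` reaches total_steps
    if step >= total_steps:
        return end
    progress = step / total_steps
    return end + (start - end) * 0.5 * (1.0 + math.cos(math.pi * progress))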