Example no. 1
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    my_logger=None,
                    name=None,
                    env_name=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    warmup_lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        warmup_lr_scheduler = utils.warmup_lr_scheduler(
            optimizer, warmup_iters, warmup_factor)

    for images, targets, name in metric_logger.log_every(
            data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            for img, t in zip(images, targets):
                print(img.shape)
                print(t)
                print(name)
            sys.exit(1)

        if my_logger:
            my_logger.scalar(loss_value,
                             env=env_name,
                             win="Loss",
                             trace=name,
                             xlabel="Iteration")

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if warmup_lr_scheduler:
            warmup_lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
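The warmup used here (and in most of the examples below) comes from a small utils.warmup_lr_scheduler helper. A minimal sketch of such a helper, assuming the linear ramp implied by the warmup_factor and warmup_iters arguments above:

import torch


def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    # Ramp the learning rate linearly from warmup_factor * base_lr
    # up to base_lr over the first warmup_iters iterations.
    def f(x):
        if x >= warmup_iters:
            return 1.0
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)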
Example no. 2
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, log_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # lr_scheduler = None
    milestones = [len(data_loader)//2]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.8)
    # if epoch == 0:
    #     warmup_factor = 1. / 1000
    #     warmup_iters = min(1000, len(data_loader) - 1)
    #
    #     lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    count = 0
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("count {}".format(count))
            print(">>>>>>>>>>>>>>>>>> bboxes")
            print(targets[0]["boxes"])
            print(">>>>>>>>>>>>>>>>>> labels")
            print(targets[0]["labels"])
            print(">>>>>>>>>>>>>>>>>> image_id")
            print(targets[0]["image_id"])
            print(">>>>>>>>>>>>>>>>>> area")
            print(targets[0]["area"])
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # ================================================================== #
        #                        Tensorboard Logging                         #
        # ================================================================== #
        if count % 100 == 0:
            n_iter = count + epoch * len(data_loader) / len(images)
            log_writer.add_scalar('Loss/total', loss_value, n_iter/100)
            log_writer.add_scalar('Loss/class', loss_dict['loss_classifier'], n_iter/100)
            log_writer.add_scalar('Loss/bbox', loss_dict['loss_box_reg'], n_iter/100)
            log_writer.add_scalar('Loss/mask', loss_dict['loss_mask'], n_iter/100)
            log_writer.add_scalar('Loss/objectness', loss_dict['loss_objectness'], n_iter/100)
            log_writer.add_scalar('Loss/rpn_box', loss_dict['loss_rpn_box_reg'], n_iter/100)
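The log_writer argument above is a TensorBoard writer. A hypothetical setup for it (the log directory and num_epochs are placeholders, not part of the original example):

from torch.utils.tensorboard import SummaryWriter

log_writer = SummaryWriter(log_dir="runs/maskrcnn")
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=100, log_writer=log_writer)
log_writer.close()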
Example no. 3
def train_one_epoch(model,
                    optimizer,
                    lr_scheduler,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    vis=None,
                    checkpoint_fn=None,
                    prob=None):

    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr',
                            utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter(
        'clips/s', utils.SmoothedValue(window_size=10, fmt='{value:.3f}'))

    header = f'Epoch: [{epoch}]'

    # Initialise wandb
    if vis is not None:
        vis.wandb_init(model)

    for step, ((video, orig, orig_unnorm), sp_mask) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        start_time = time.time()

        grid = np.random.choice([True, False], p=[prob, 1 - prob])

        if grid:
            video = video.to(device)
            output, loss, diagnostics = model(
                video, None, None,
                orig_unnorm=None) if not args.teacher_student else model(video)
        else:
            sp_mask = sp_mask.to(device)
            orig = orig.to(device)
            max_sp_num = len(torch.unique(sp_mask))
            output, loss, diagnostics = model(orig,
                                              sp_mask,
                                              max_sp_num,
                                              orig_unnorm=orig_unnorm)

        loss = loss.mean()

        # if vis is not None and np.random.random() < 0.01:
        if vis is not None:
            vis.log(dict(loss=loss.mean().item()))
            vis.log({k: v.mean().item() for k, v in diagnostics.items()})

        # NOTE Stochastic checkpointing has been retained
        if checkpoint_fn is not None and np.random.random() < 0.005:
            checkpoint_fn()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        metric_logger.update(loss=loss.item(),
                             lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['clips/s'].update(video.shape[0] /
                                               (time.time() - start_time))
        lr_scheduler.step()

        # Change Compactness During The Epoch
        # if step > len(data_loader)//2 and epoch < 15:
        #     compactness = data_loader.dataset.get_compactness()
        #     data_loader.dataset.set_compactness(compactness - 10)

    if checkpoint_fn is not None:
        checkpoint_fn()
Example no. 4
def train(batch_size, checkpoint_freq, num_epochs):

    num_classes = 2
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True, rpn_nms_thresh=1, rpn_pre_nms_top_n_train=5000)

    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256

    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    model = torch.nn.DataParallel(model)
    model.to('cuda')

    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   collate_fn=utils.collate_fn)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    hook = smd.Hook.create_from_json_file()

    for epoch in range(num_epochs):

        hook.set_mode(modes.TRAIN)
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        if epoch == 0:
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(data_loader) - 1)

            lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                     warmup_factor)

        for iteration, (images, targets) in enumerate(data_loader):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda')
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            optimizer.zero_grad()
            losses.backward()

            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if iteration % checkpoint_freq == 0:
                utils.save_on_master(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, 'model_{}.pth'.format(epoch))

        lr_scheduler.step()

        hook.set_mode(modes.EVAL)
        evaluate(model, data_loader_test, device='cuda')
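The detection data loaders above pass collate_fn=utils.collate_fn because images and targets vary in size and cannot be stacked into a single tensor. A minimal sketch of such a collate function, assuming it only needs to regroup the batch:

def collate_fn(batch):
    # Turn a list of (image, target) pairs into a tuple of images
    # and a tuple of targets, leaving each element untouched.
    return tuple(zip(*batch))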
Example no. 5
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    writer=None):
    count = 0

    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        count += 1

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        #print(len(targets))
        flag = 0
        for i in range(len(targets)):
            if len(targets[i]['boxes']) == 0:
                flag = 1
                break

        if flag == 1:
            continue

        loss_dict = model(images, targets)

        # losses = sum(loss for loss in loss_dict.values())
        losses = 0
        for i in loss_dict:
            if i == 'loss_keypoint':
                losses += loss_dict[i] * 0.5
            else:
                losses += loss_dict[i]

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            #           sys.exit(1)
            continue

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if writer and count % 100 == 0:
            writer.add_scalar('loss_box_reg',
                              loss_dict_reduced['loss_box_reg'],
                              epoch * len(data_loader) + count)
            writer.add_scalar('loss_classifier',
                              loss_dict_reduced['loss_classifier'],
                              epoch * len(data_loader) + count)
            writer.add_scalar('loss_mask', loss_dict_reduced['loss_mask'],
                              epoch * len(data_loader) + count)
            writer.add_scalar('loss_keypoint',
                              loss_dict_reduced['loss_keypoint'],
                              epoch * len(data_loader) + count)
Example no. 6
File: engine.py Project: ofekp/imat
def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    gradient_accumulation_steps, print_freq, box_threshold):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    optimizer.zero_grad()  # gradient_accumulation
    steps = 0  # gradient_accumulation
    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        # print("target: {}".format(targets))

        steps += 1  # gradient_accumulation
        images = list(image.to(device) for image in images)
        targets = [{
            k: v.to(device) if torch.is_tensor(v) else v
            for k, v in t.items()
        } for t in targets]

        if box_threshold is None:
            loss_dict = model(images, targets)
        else:
            loss_dict = model(images, box_threshold, targets)

        # print(loss_dict)
        losses = sum(loss / gradient_accumulation_steps
                     for loss in loss_dict.values())  # gradient_accumulation

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        #optimizer.zero_grad()
        losses.backward()

        # ofekp: we add grad clipping here to avoid instabilities in training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        # gradient_accumulation
        if steps % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
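Because every loss term is divided by gradient_accumulation_steps and the optimizer only steps every gradient_accumulation_steps iterations, the effective batch size is batch_size * gradient_accumulation_steps. A hypothetical driver loop for this variant (the argument values are placeholders):

for epoch in range(num_epochs):
    # With batch_size=2 and 4 accumulation steps, each optimizer.step()
    # sees gradients equivalent to a batch of 8 images.
    metric_logger = train_one_epoch(model, optimizer, data_loader, device,
                                    epoch, gradient_accumulation_steps=4,
                                    print_freq=50, box_threshold=None)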
Example no. 7
def train_one_epoch_FastRCNN(model,
                             optimizer,
                             data_loader,
                             device,
                             epoch,
                             print_freq,
                             mode="sew6",
                             encoder=None,
                             train_encoder=False):
    # this data loader is the loader passed in by the caller
    # mode can be "sew6", "panorm", or "autoencode"
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    #     if epoch == 0:
    #         warmup_factor = 1. / 1000
    #         warmup_iters = min(1000, len(data_loader) - 1)

    #         lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
    if mode in ('panorm', 'sew6'):
        # the same 800x800 transform is reused by the sew6 branch below
        tt = transforms.Compose(
            [transforms.Resize((800, 800)),
             transforms.ToTensor(), normalize])  # this is for the 6-image combo
    for sample, old_targets, road_image, extra in metric_logger.log_every(
            data_loader, print_freq, header):

        #images = sample[0]

        targets = trans_target(old_targets)
        #print("images len {}, targets len {}".format(len(images), len(targets)))
        #print("len(sample) {}, sample [0] shape {}".format(len(sample), sample[0].shape)) # [6, 3, 256, 306]
        #images = list(image.to(device) for image in images)
        if mode == "panorm":
            images = [
                tt(s).to(device)
                for s in sew_images_panorm(sample, to_img=True)
            ]

        elif mode == "autoencode":
            encoder.cuda()
            samp_pan = sew_images_panorm(sample)  #convert to panoramic tensor
            samp_pan = [normalize(i) for i in samp_pan]
            samp_pan_t = torch.stack(samp_pan, dim=0)  #stack
            if train_encoder:
                images = encoder.return_image_tensor(
                    samp_pan_t.to(device), train_encoder
                )  #see if it will take it or it needs to take a list
            else:
                images = encoder.return_image_tensor(samp_pan_t.cuda(),
                                                     train_encoder).to(device)

        else:  #mode is sew6
            images = [tt(sew_images(s)).to(device) for s in sample
                      ]  #list of [3, 800, 800], should be 1 per patch
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        #print(loss_dict)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
Example no. 8
def train_one_epoch(model: torch.nn.Module,
                    criterion: DistillationLoss,
                    data_loader: Iterable,
                    optimizer: torch.optim.Optimizer,
                    device: torch.device,
                    epoch: int,
                    loss_scaler,
                    max_norm: float = 0,
                    model_ema: Optional[ModelEma] = None,
                    mixup_fn: Optional[Mixup] = None,
                    teacher=None,
                    set_training_mode=True):
    # TODO fix this for finetuning
    # model.train(set_training_mode)
    model.train()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 100

    for samples, targets in metric_logger.log_every(data_loader, print_freq,
                                                    header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        samples, targets, mix_rate, aux_targets = two_mix(
            samples, targets, num_patch=samples.shape[-1] // 16)

        with torch.cuda.amp.autocast():
            # outputs, r_loss = model(samples)
            outputs, r_loss, s_loss, proj = model(samples, aux_targets)
            loss = torch.sum(-targets * (1e-8 + outputs.softmax(dim=-1)).log(),
                             dim=-1).mean()

            loss_value = loss.item()
            loss += 1. * (r_loss + 1. * s_loss)

        if not math.isfinite(loss.item()):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss,
                    optimizer,
                    clip_grad=max_norm,
                    parameters=model.parameters(),
                    create_graph=is_second_order)

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['r'].update(r_loss.item(), n=targets.shape[0])
        # metric_logger.meters['p'].update(proj.item(), n=targets.shape[0])
        metric_logger.meters['s'].update(s_loss.item(), n=targets.shape[0])
        # metric_logger.meters['cos'].update(cos.item(), n=targets.shape[0])
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
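This variant, like several below, returns the globally averaged meters. A hypothetical caller that records them per epoch:

train_stats = train_one_epoch(model, criterion, data_loader, optimizer,
                              device, epoch, loss_scaler)
log_entry = {'train_' + k: v for k, v in train_stats.items()}
log_entry['epoch'] = epoch
print(log_entry)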
Example no. 9
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    writer, ckpt_path):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for batch_idx, (images, targets) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # log per-iteration scalars; batch_idx comes from the outer loop
        writer.add_scalar('Training Loss', loss_value,
                          epoch * len(data_loader) + batch_idx)
        writer.add_scalar('loss_classifier',
                          loss_dict_reduced['loss_classifier'].item(),
                          epoch * len(data_loader) + batch_idx)
        writer.add_scalar('loss_box_reg',
                          loss_dict_reduced['loss_box_reg'].item(),
                          epoch * len(data_loader) + batch_idx)
        writer.add_scalar('loss_objectness',
                          loss_dict_reduced['loss_objectness'].item(),
                          epoch * len(data_loader) + batch_idx)
        writer.add_scalar('loss_rpn_box_reg',
                          loss_dict_reduced['loss_rpn_box_reg'].item(),
                          epoch * len(data_loader) + batch_idx)
        for name, param in model.named_parameters():
            if param.grad is not None:
                param_norm = param.grad.data.norm(2).cpu().item()
                writer.add_histogram(name + '_grad', param_norm, epoch)
            # else:
            #     print("{} has no grad".format(name))

        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

    #   Save model
    print("Saving model at training epoch: {}".format(epoch + 1))
    ckpt_dict = {
        'epoch': epoch + 1,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(
        ckpt_dict,
        os.path.join(
            ckpt_path, 'ckpt_epoch-' + str(epoch + 1) + 'loss' +
            str(loss_value) + '.pth'))
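The checkpoint dict written above can be reloaded to resume training. A hypothetical resume snippet (checkpoint_file stands for whichever 'ckpt_epoch-...' file was saved):

import torch

checkpoint = torch.load(checkpoint_file, map_location='cpu')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']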
Example no. 10
def train_one_epoch(model: torch.nn.Module,
                    d_vae: torch.nn.Module,
                    data_loader: Iterable,
                    optimizer: torch.optim.Optimizer,
                    device: torch.device,
                    epoch: int,
                    loss_scaler,
                    max_norm: float = 0,
                    log_writer=None,
                    lr_scheduler=None,
                    start_steps=None,
                    lr_schedule_values=None,
                    wd_schedule_values=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'min_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10

    for step, (batch, _) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        # assign learning rate & weight decay for each step
        it = start_steps + step  # global training iteration
        if lr_schedule_values is not None or wd_schedule_values is not None:
            for i, param_group in enumerate(optimizer.param_groups):
                if lr_schedule_values is not None:
                    param_group["lr"] = lr_schedule_values[it] * param_group[
                        "lr_scale"]
                if wd_schedule_values is not None and param_group[
                        "weight_decay"] > 0:
                    param_group["weight_decay"] = wd_schedule_values[it]

        samples, images, bool_masked_pos = batch
        images = images.to(device, non_blocking=True)
        samples = samples.to(device, non_blocking=True)
        bool_masked_pos = bool_masked_pos.to(device, non_blocking=True)

        with torch.no_grad():
            input_ids = d_vae.get_codebook_indices(images).flatten(1)
            bool_masked_pos = bool_masked_pos.flatten(1).to(torch.bool)
            labels = input_ids[bool_masked_pos]

        with torch.cuda.amp.autocast():
            outputs = model(samples,
                            bool_masked_pos=bool_masked_pos,
                            return_all_tokens=False)
            loss = nn.CrossEntropyLoss()(input=outputs, target=labels)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()
        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        grad_norm = loss_scaler(loss,
                                optimizer,
                                clip_grad=max_norm,
                                parameters=model.parameters(),
                                create_graph=is_second_order)
        loss_scale_value = loss_scaler.state_dict()["scale"]

        torch.cuda.synchronize()

        mlm_acc = (outputs.max(-1)[1] == labels).float().mean().item()

        metric_logger.update(mlm_acc=mlm_acc)
        if log_writer is not None:
            log_writer.update(mlm_acc=mlm_acc, head="loss")

        metric_logger.update(loss=loss_value)
        metric_logger.update(loss_scale=loss_scale_value)
        min_lr = 10.
        max_lr = 0.
        for group in optimizer.param_groups:
            min_lr = min(min_lr, group["lr"])
            max_lr = max(max_lr, group["lr"])

        metric_logger.update(lr=max_lr)
        metric_logger.update(min_lr=min_lr)
        weight_decay_value = None
        for group in optimizer.param_groups:
            if group["weight_decay"] > 0:
                weight_decay_value = group["weight_decay"]
        metric_logger.update(weight_decay=weight_decay_value)
        metric_logger.update(grad_norm=grad_norm)

        if log_writer is not None:
            log_writer.update(loss=loss_value, head="loss")
            log_writer.update(loss_scale=loss_scale_value, head="opt")
            log_writer.update(lr=max_lr, head="opt")
            log_writer.update(min_lr=min_lr, head="opt")
            log_writer.update(weight_decay=weight_decay_value, head="opt")
            log_writer.update(grad_norm=grad_norm, head="opt")

            log_writer.set_step()

        if lr_scheduler is not None:
            lr_scheduler.step_update(start_steps + step)
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
Example no. 11
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, loss_scaler, max_norm: float = 0,
                    model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None,
                    amp: bool = True, teacher_model: torch.nn.Module = None,
                    teach_loss: torch.nn.Module = None, distill_token: bool=False, choices=None, mode='super', retrain_config=None):
    model.train()
    criterion.train()

    # set random seed
    random.seed(epoch)

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10
    if mode == 'retrain':
        config = retrain_config
        model_module = unwrap_model(model)
        print(config)
        model_module.set_sample_config(config=config)
        print(model_module.get_sampled_params_numel(config))

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # sample random config
        if mode == 'super':
            config = sample_configs(choices=choices)
            model_module = unwrap_model(model)
            model_module.set_sample_config(config=config)
        elif mode == 'retrain':
            config = retrain_config
            model_module = unwrap_model(model)
            model_module.set_sample_config(config=config)
        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)
        if amp:
            with torch.cuda.amp.autocast():
                if teacher_model:
                    with torch.no_grad():
                        teach_output = teacher_model(samples)
                    _, teacher_label = teach_output.topk(1, 1, True, True)
                    if distill_token:
                        output_cls, output_dis = model(samples)
                        loss = 1/2 * criterion(output_cls, targets) + 1/2 * teach_loss(output_dis, teacher_label.squeeze())
                    else:
                        outputs = model(samples)
                        loss = 1/2 * criterion(outputs, targets) + 1/2 * teach_loss(outputs, teacher_label.squeeze())
                else:
                    outputs = model(samples)
                    loss = criterion(outputs, targets)
        else:
            outputs = model(samples)
            if teacher_model:
                with torch.no_grad():
                    teach_output = teacher_model(samples)
                _, teacher_label = teach_output.topk(1, 1, True, True)
                loss = 1 / 2 * criterion(outputs, targets) + 1 / 2 * teach_loss(outputs, teacher_label.squeeze())
            else:
                loss = criterion(outputs, targets)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        if amp:
            is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
            loss_scaler(loss, optimizer, clip_grad=max_norm,
                    parameters=model.parameters(), create_graph=is_second_order)
        else:
            loss.backward()
            optimizer.step()

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
Example no. 12
    def train_one_epoch(self, lr_schedule='cyclic'):
        self.model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(self.epoch)

        lr_scheduler = None
        if (self.epoch == 0):
            if lr_schedule == 'warmup':
                warmup_factor = 1. / 1000
                warmup_iters = min(1000, len(self.data_loader) - 1)

                lr_scheduler = utils.warmup_lr_scheduler(self.optimizer, warmup_iters, warmup_factor)
            elif lr_schedule == 'cyclic':
                lr_scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer, 1e-6, 1e-2)

        for iteration, (images, targets) in enumerate(metric_logger.log_every(self.data_loader, self.print_freq, header)):
            with torch.autograd.detect_anomaly():
                images = list(image.to(self.device) for image in images)
                targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]

                loss_dict = self.model(images, targets)

                losses = sum(loss for loss in loss_dict.values())

                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = utils.reduce_dict(loss_dict)
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())

                loss_value = losses_reduced.item()

                if self.emergency is True:
                    if not math.isfinite(loss_value):
                        print()
                        print("Loss is {}, stopping training".format(loss_value))
                        print(loss_dict_reduced)
                        sys.exit(1)

                self.optimizer.zero_grad()
                losses.backward()
                grad_norm = clip_grad_norm_(self.model.parameters(), grad_clip_norm_value)
                self.optimizer.step()

                if lr_scheduler is not None:
                    lr_scheduler.step()

                metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
                metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])

                if self.logger is not None:
                    if iteration % 50 == 0:
                        # 1. Log scalar values (scalar summary)
                        info = {'loss': losses_reduced, **loss_dict_reduced}

                        for tag, value in info.items():
                            self.logger.scalar_summary(tag, value, iteration+1)

                        # 2. Log values and gradients of the parameters (histogram summary)
                        for tag, value in self.model.named_parameters():
                            tag = tag.replace('.', '/')
                            self.logger.histo_summary(tag, value.data.cpu().numpy(), iteration+1)
                       
        self.epoch += 1         
Example no. 13
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch + 1)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    info_dict = {
        "lr": [],
        "loss_comb": [],
        "loss_classifier": [],
        "loss_box_reg": [],
        "loss_objectness": [],
        "loss_rpn_box_reg": []
    }

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        info_dict["loss_comb"].append(loss_value)
        for k in loss_dict_reduced.keys():
            info_dict[k].append(loss_dict_reduced[k].item())
        info_dict["lr"].append(optimizer.param_groups[0]["lr"])
    info_dict["total_loss"] = sum(info_dict["loss_comb"]) / len(
        info_dict["loss_comb"])
    return info_dict
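The returned info_dict keeps one value per iteration. A hypothetical consumer that summarizes it at the end of the epoch:

info = train_one_epoch(model, optimizer, data_loader, device, epoch,
                       print_freq=50)
for key in ("loss_classifier", "loss_box_reg",
            "loss_objectness", "loss_rpn_box_reg"):
    values = info[key]
    print("{}: {:.4f}".format(key, sum(values) / len(values)))
print("mean combined loss: {:.4f}".format(info["total_loss"]))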
Example no. 14
def train_SSL(model: torch.nn.Module,
              criterion,
              data_loader: Iterable,
              optimizer: torch.optim.Optimizer,
              device: torch.device,
              epoch: int,
              loss_scaler,
              max_norm: float = 0,
              model_ema: Optional[ModelEma] = None,
              mixup_fn: Optional[Mixup] = None):
    model.train(True)
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 50
    i = 0
    for imgs1, rots1, imgs2, rots2 in metric_logger.log_every(
            data_loader, print_freq, header):

        imgs1 = imgs1.to(device, non_blocking=True)
        imgs1_aug = distortImages(imgs1)  # Apply distortion
        rots1 = rots1.to(device, non_blocking=True)

        imgs2 = imgs2.to(device, non_blocking=True)
        imgs2_aug = distortImages(imgs2)
        rots2 = rots2.to(device, non_blocking=True)

        with torch.cuda.amp.autocast():

            rot1_p, contrastive1_p, imgs1_recon, r_w, cn_w, rec_w = model(
                imgs1_aug)
            rot2_p, contrastive2_p, imgs2_recon, _, _, _ = model(imgs2_aug)

            rot_p = torch.cat([rot1_p, rot2_p], dim=0)
            rots = torch.cat([rots1, rots2], dim=0)

            imgs_recon = torch.cat([imgs1_recon, imgs2_recon], dim=0)
            imgs = torch.cat([imgs1, imgs2], dim=0)

            loss, (loss1, loss2, loss3) = criterion(rot_p, rots,
                                                    contrastive1_p,
                                                    contrastive2_p, imgs_recon,
                                                    imgs, r_w, cn_w, rec_w)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss,
                    optimizer,
                    clip_grad=max_norm,
                    parameters=model.parameters(),
                    create_graph=is_second_order)

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(RotationLoss=loss1.data.item())
        metric_logger.update(RotationScalar=r_w.data.item())
        metric_logger.update(ContrastiveLoss=loss2.data.item())
        metric_logger.update(ContrastiveScalar=cn_w.data.item())
        metric_logger.update(ReconstructionLoss=loss3.data.item())
        metric_logger.update(ReconstructionScalar=rec_w.data.item())
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        i = i + 1
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
Example no. 15
def train_one_epoch(model,
                    criterion,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    apex=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ", device=device)
    metric_logger.add_meter('lr',
                            utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('img/s',
                            utils.SmoothedValue(window_size=10, fmt='{value}'))

    header = 'Epoch: [{}]'.format(epoch)
    step_count = 0
    last_print_time = time.time()

    for image, target in metric_logger.log_every(data_loader, print_freq,
                                                 header):
        image, target = image.to(device, non_blocking=True), target.to(
            device, non_blocking=True)

        dl_ex_start_time = time.time()

        if args.channels_last:
            image = image.contiguous(memory_format=torch.channels_last)

            if args.run_lazy_mode:
                # This mark_step is added so that the lazy kernel can
                # create and evaluate the graph to infer the resulting tensor
                # as channels_last
                import habana_frameworks.torch.core as htcore
                htcore.mark_step()
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()

        # We see a performance gain for mobilenet_v2 by adding this mark_step.
        if (args.run_lazy_mode and 'mobilenet_v2' in args.model):
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        if apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()

        optimizer.step()

        if args.run_lazy_mode:
            import habana_frameworks.torch.core as htcore
            htcore.mark_step()
        if step_count % print_freq == 0:
            output_cpu = output.detach().to('cpu')
            acc1, acc5 = utils.accuracy(output_cpu, target, topk=(1, 5))
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item(),
                                 lr=optimizer.param_groups[0]["lr"])
            metric_logger.meters['acc1'].update(acc1.item(),
                                                n=batch_size * print_freq)
            metric_logger.meters['acc5'].update(acc5.item(),
                                                n=batch_size * print_freq)
            current_time = time.time()
            last_print_time = dl_ex_start_time if args.dl_time_exclude else last_print_time
            metric_logger.meters['img/s'].update(
                batch_size * print_freq / (current_time - last_print_time))
            last_print_time = time.time()

        step_count = step_count + 1
        if step_count >= args.num_train_steps:
            break
Example no. 16
def train_one_epoch(
        model,
        arch,
        optimizer,
        lr_scheduler,
        data_loader,
        device,
        epoch,
        print_freq,
        ngpus_per_node,
        model_without_ddp,
        args
    ):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    # header = "Epoch: [{}]".format(epoch)

    for images, targets in metric_logger.log_every(
            iterable=data_loader,
            print_freq=print_freq,
            # header=header,
            iter_num=args.iter_num
        ):

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        """
        [{"boxes": tensor([], device="cuda:0"), "labels": tensor([], device="cuda:0", dtype=torch.int64), "masks": tensor([], device="cuda:0", dtype=torch.uint8), "iscrowd": tensor([], device="cuda:0", dtype=torch.int64)}]
        """

        try:
            loss_dict = model(images, targets) 
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                logger.fatal("Loss is {}, stopping training".format(loss_value))
                logger.fatal(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        except Exception as e:
            logger.warning(e, exc_info=True)
            # logger.info("print target for debug")
            # print(targets)

        args.iter_num += 1

        # save checkpoint here
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            if args.iter_num % 1000 == 0:
                utils.save_on_master({
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "iter_num": args.iter_num,
                        "args": args,
                    },
                    "{}/{}_{}.pth".format(checkpoint_dir, arch.__name__, args.iter_num)
                )

                os.makedirs("{}/debug_image/".format(checkpoint_dir), exist_ok=True)

                if args.iter_num < 5000:
                    continue

                model.eval()

                # from barez import overlay_ann	
                debug_image = None
                debug_image_list = []
                cnt = 0
                for image_path in glob.glob("./table_test/*"):
                    cnt += 1
                    image_name = os.path.basename(image_path)
                    # print(image_name)
                    image = cv2.imread(image_path)
                    rat = 1300 / image.shape[0]
                    image = cv2.resize(image, None, fx=rat, fy=rat)

                    transform = transforms.Compose([transforms.ToTensor()])
                    image = transform(image)

                    # put the model in evaluation mode
                    with torch.no_grad():
                        tensor = [image.to(device)]
                        prediction = model(tensor)
                        
                    image = torch.squeeze(image, 0).permute(1, 2, 0).mul(255).numpy().astype(np.uint8)

                    for pred in prediction:
                        for idx, mask in enumerate(pred['masks']):
                            if pred['scores'][idx].item() < 0.5:
                                continue
                        
                            m =  mask[0].mul(255).byte().cpu().numpy()
                            box = list(map(int, pred["boxes"][idx].tolist())) 
                            score = pred["scores"][idx].item()
                            # image = overlay_ann(image, m, box, "", score)

                    # if debug_image is None:
                    #     debug_image = image
                    # else:
                    #     debug_image = np.concatenate((debug_image, image), axis=1)

                    # if cnt == 10:
                    #     cnt = 0
                    #     debug_image_list.append(debug_image)
                    #     debug_image = None
                    
                avg_length = np.mean([i.shape[1] for i in debug_image_list])

                
                di = None

                
                for debug_image in debug_image_list:
                    rat = avg_length / debug_image.shape[1]
                    debug_image = cv2.resize(debug_image, None, fx=rat, fy=rat)

                    if di is None:
                        di = debug_image
                    else:
                        
                        di = np.concatenate((di, debug_image), axis=0)
            

                di = cv2.resize(di, None, fx=0.4, fy=0.4)
                cv2.imwrite("{}/debug_image/{}.jpg".format(checkpoint_dir, args.iter_num), di)

                model.train()

        # hard stop
        if args.iter_num == 50000:
            logger.info("ITER NUM == 50k, training successfully!")
            raise SystemExit
Example no. 17
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    tb_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        "lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        try:
            # keep only image/target pairs that contain at least one box
            keep = [t["boxes"].shape[0] > 0 for t in targets]
            images = [image.to(device)
                      for image, k in zip(images, keep) if k]
            targets = [{key: v.to(device) for key, v in t.items()}
                       for t, k in zip(targets, keep) if k]
        except Exception:
            print("failed to move batch to device, skipping it")
            continue

        try:
            loss_dict = model(images, targets)
        except Exception:
            print("forward pass failed for this batch, skipping it")
            continue

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)

        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced,
                             epoch=tb_writer["step"],
                             tb_writer=tb_writer["writer"],
                             **loss_dict_reduced)
        metric_logger.update(
            lr=optimizer.param_groups[0]["lr"],
            epoch=tb_writer["step"],
            tb_writer=tb_writer["writer"],
        )
        tb_writer["step"] += 1
Example no. 18
        fine_train_dataset = load_dataset(train_imgs, fine_tr, bs)
        coarse_train_dataset = load_dataset(train_imgs, coarse_tr, bs)
        for e in range(epochs):
            print('Starting training epoch %g of %g...' % (e + 1, epochs))

            fine_train_loader = load_dataloader(bs, fine_train_dataset)
            coarse_train_loader = load_dataloader(bs, coarse_train_dataset)

            fine_train_nb = len(fine_train_loader)
            coarse_train_nb = len(coarse_train_loader)
            assert fine_train_nb == coarse_train_nb, 'fine & coarse train batch number is not matched'
            nb = fine_train_nb

            # Logger
            fine_metric_logger = utils.MetricLogger(delimiter="  ")
            fine_metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
            coarse_metric_logger = utils.MetricLogger(delimiter="  ")
            coarse_metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
            fine_header = 'Fine Epoch: [{}]'.format(e)
            coarse_header = 'Coarse Epoch: [{}]'.format(e)

            # warmup
            fine_lr_scheduler = None
            coarse_lr_scheduler = None
            if e == 0:
                warmup_factor = 1. / 1000
                warmup_iters = min(1000, fine_train_nb - 1)
                fine_lr_scheduler = utils.warmup_lr_scheduler(fine_optim, warmup_iters, warmup_factor)
                coarse_lr_scheduler = utils.warmup_lr_scheduler(coarse_optim, warmup_iters, warmup_factor)

            for i, (fine_train, coarse_train) in enumerate(zip(fine_train_loader, coarse_train_loader)):
Esempio n. 19
0
def train(model, loss_fn, optimizer, data_loader_train, data_loader_test,
          scaled_lr):
    """Train and evaluate the model

    Args:
        model (dlrm):
        loss_fn (torch.nn.Module): Loss function
        optimizer (torch.optim.Optimizer):
        data_loader_train (torch.utils.data.DataLoader):
        data_loader_test (torch.utils.data.DataLoader):
        scaled_lr (float)
    """
    # Print per 16384 * 2000 samples by default
    default_print_freq = 16384 * 2000 // FLAGS.batch_size
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader_train)
    # MLPerf requires 20 tests per epoch
    test_freq = FLAGS.test_freq if FLAGS.test_freq is not None else steps_per_epoch // 20

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'loss', utils.SmoothedValue(window_size=print_freq, fmt='{avg:.4f}'))
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.4f}'))

    lr_scheduler = utils.LearningRateScheduler(
        optimizers=[optimizer],
        base_lrs=[scaled_lr],
        warmup_steps=FLAGS.warmup_steps,
        warmup_factor=FLAGS.warmup_factor,
        decay_start_step=FLAGS.decay_start_step,
        decay_steps=FLAGS.decay_steps,
        decay_power=FLAGS.decay_power,
        end_lr_factor=FLAGS.decay_end_lr / FLAGS.lr)

    step = 0
    start_time = time()
    stop_time = time()
    for epoch in range(FLAGS.epochs):
        epoch_start_time = time()

        for numerical_features, categorical_features, click in data_loader_train:
            global_step = steps_per_epoch * epoch + step
            lr_scheduler.step()

            output = model(numerical_features, categorical_features).squeeze()
            loss = loss_fn(output, click)

            optimizer.zero_grad()
            if FLAGS.fp16:
                loss *= FLAGS.loss_scale
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            # Cancel loss scale for logging if fp16 is used
            metric_logger.update(loss=loss.item() /
                                 (FLAGS.loss_scale if FLAGS.fp16 else 1),
                                 lr=optimizer.param_groups[0]["lr"] *
                                 (FLAGS.loss_scale if FLAGS.fp16 else 1))

            if step % print_freq == 0:
                # Average across a print_freq period to reduce the error;
                # accurate per-step timing would need a synchronize, which would slow things down.
                metric_logger.update(step_time=(time() - stop_time) /
                                     print_freq)
                stop_time = time()
                eta_str = datetime.timedelta(
                    seconds=int(metric_logger.step_time.global_avg *
                                (steps_per_epoch - step)))
                metric_logger.print(
                    header=
                    F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}"
                )

            if global_step % test_freq == 0 and global_step > 0 and global_step / steps_per_epoch >= FLAGS.test_after:
                loss, auc = evaluate(model, loss_fn, data_loader_test)
                print(
                    F"Epoch {epoch} step {step}. Test loss {loss:.4f}, auc {auc:.6f}"
                )
                stop_time = time()

                if auc >= FLAGS.auc_threshold:
                    run_time_s = int(stop_time - start_time)
                    print(
                        F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
                        F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
                        F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s."
                    )
                    return
            step += 1

        epoch_stop_time = time()
        epoch_time_s = epoch_stop_time - epoch_start_time
        print(
            F"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
            F"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s."
        )
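
utils.LearningRateScheduler is not included in the snippet, so the exact schedule is unknown; a rough stand-in consistent with the constructor arguments above (linear warmup, flat learning rate until decay_start_step, then polynomial decay down to end_lr_factor of the base rate) might look like this. The class name and the warmup shape are assumptions, not the real implementation.

class SimpleLRScheduler:
    # Step-based warmup + polynomial decay; details are guessed from the
    # arguments passed to utils.LearningRateScheduler above.
    def __init__(self, optimizers, base_lrs, warmup_steps,
                 decay_start_step, decay_steps, decay_power, end_lr_factor):
        self.optimizers = optimizers
        self.base_lrs = base_lrs
        self.warmup_steps = warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_steps = decay_steps
        self.decay_power = decay_power
        self.end_lr_factor = end_lr_factor
        self.step_count = 0

    def step(self):
        self.step_count += 1
        if self.step_count < self.warmup_steps:
            scale = self.step_count / self.warmup_steps          # linear warmup
        elif self.step_count < self.decay_start_step:
            scale = 1.0                                          # hold base LR
        else:
            progress = min((self.step_count - self.decay_start_step)
                           / max(self.decay_steps, 1), 1.0)
            scale = max((1.0 - progress) ** self.decay_power, self.end_lr_factor)
        for optimizer, base_lr in zip(self.optimizers, self.base_lrs):
            for group in optimizer.param_groups:
                group["lr"] = base_lr * scale
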
Esempio n. 20
0
def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler,
                    device, epoch, args, print_freq, logger, iterations,
                    bert_model, baseline_model):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr',
                            utils.SmoothedValue(window_size=1, fmt='{value}'))
    header = 'Epoch: [{}]'.format(epoch)
    train_loss = 0
    total_its = 0
    train_emb_loss = 0
    train_seg_loss = 0

    for data in metric_logger.log_every(data_loader, print_freq, header):

        total_its += 1

        image, target, sentences, attentions = data
        image, target, sentences, attentions = image.to(device), target.to(
            device), sentences.to(device), attentions.to(device)

        sentences = sentences.squeeze(1)
        attentions = attentions.squeeze(1)

        if args.baseline_bilstm:

            num_tokens = torch.sum(attentions, dim=-1)
            unbinded_sequences = list(torch.unbind(sentences, dim=0))
            processed_seqs = [
                seq[:num_tokens[i], :]
                for i, seq in enumerate(unbinded_sequences)
            ]

            packed_sentences = torch.nn.utils.rnn.pack_sequence(
                processed_seqs, enforce_sorted=False)
            hidden_states, cell_states = baseline_model[0](packed_sentences)
            hidden_states = torch.nn.utils.rnn.pad_packed_sequence(
                hidden_states, batch_first=True, total_length=20)

            hidden_states = hidden_states[0]

            unbinded_hidden_states = list(torch.unbind(hidden_states, dim=0))

            processed_hidden_states = [
                seq[:num_tokens[i], :]
                for i, seq in enumerate(unbinded_hidden_states)
            ]

            mean_hidden_states = [
                torch.mean(seq, dim=0).unsqueeze(0)
                for seq in processed_hidden_states
            ]
            last_hidden_states = torch.cat(mean_hidden_states, dim=0)

            last_hidden_states = baseline_model[1](last_hidden_states)
            last_hidden_states = last_hidden_states.unsqueeze(1)

        else:

            last_hidden_states = bert_model(sentences,
                                            attention_mask=attentions)[0]

        embedding = last_hidden_states[:, 0, :]
        output, vis_emb, lan_emb = model(image, embedding.squeeze(1))

        loss = criterion(output, target, args)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if args.linear_lr:
            adjust_learning_rate(optimizer, epoch, args)
        else:
            lr_scheduler.step()

        train_loss += loss.item()
        iterations += 1

        metric_logger.update(loss=loss.item(),
                             lr=optimizer.param_groups[0]["lr"])

        del image, target, sentences, attentions, loss, embedding, output, vis_emb, lan_emb, last_hidden_states, data

        gc.collect()
        torch.cuda.empty_cache()

    train_loss = train_loss / total_its

    logger.scalar_summary('loss', train_loss, epoch)
    logger.scalar_summary('lr', optimizer.param_groups[0]["lr"], epoch)
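
In the non-baseline branch, the sentence embedding is the first ([CLS]) token of BERT's last hidden state. A small stand-alone sketch of that pattern, assuming the Hugging Face transformers package is available; the model name and max length are illustrative:

import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")

enc = tokenizer(["the dog on the left"], padding="max_length",
                max_length=20, return_tensors="pt")
with torch.no_grad():
    last_hidden_states = bert(enc["input_ids"],
                              attention_mask=enc["attention_mask"])[0]

# [CLS] token embedding, shape [batch, hidden_size]
embedding = last_hidden_states[:, 0, :]
print(embedding.shape)  # torch.Size([1, 768])
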
Esempio n. 21
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for i in metric_logger.log_every(data_loader, print_freq, header):

        try:
            images, targets = i
            '''This part will be reworked'''
            targets["boxes"] = targets["boxes"].to(device)
            targets["labels"] = targets["labels"].to(device)
            targets["boxes"].squeeze_()
            targets["labels"].squeeze_()
            targets1 = [{k: v for k, v in targets.items()}]
            
            images = images.to(device)
            targets = targets1
            # zero the parameter gradients

            # forward
            # track history if only in train
            #images = list(image.to(device) for image in images)
            #targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()
            #print(targets[0]["boxes"])
            if not math.isfinite(loss_value):
                print(images.size())
                print(targets[0]["boxes"])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        except ValueError:
            continue
            
    return metric_logger
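
The squeezing and re-wrapping above works around a DataLoader that stacks single-sample batches into plain tensors. torchvision's detection references avoid this by passing a collate function that keeps images and targets as lists; a minimal sketch (the loader setup in the comment is illustrative):

import torch
from torch.utils.data import DataLoader


def collate_fn(batch):
    # Keep variable-sized images and their target dicts as tuples of lists
    # instead of stacking them into a single tensor.
    return tuple(zip(*batch))

# usage, assuming the dataset yields (image_tensor, target_dict) pairs:
# loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
# for images, targets in loader:
#     images = [img.to(device) for img in images]
#     targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
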
Esempio n. 22
0
def my_train_one_epoch(model: torch.nn.Module,
                       criterion: DistillationLoss,
                       data_loader: Iterable,
                       optimizer: torch.optim.Optimizer,
                       device: torch.device,
                       epoch: int,
                       loss_scaler,
                       max_norm: float = 0,
                       model_ema: Optional[ModelEma] = None,
                       mixup_fn: Optional[Mixup] = None,
                       set_training_mode=True,
                       fp32=False):
    model.train(set_training_mode)
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10

    # prefetcher = data_prefetcher(data_loader, device, prefetch=True)
    prefetcher = DataPrefetcher(data_loader)
    samples, targets = prefetcher.next()

    for _ in metric_logger.log_every(range(len(data_loader)), print_freq,
                                     header):

        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        # with torch.cuda.amp.autocast():
        #     outputs = model(samples)
        #     loss = criterion(samples, outputs, targets)
        with torch.cuda.amp.autocast(enabled=not fp32):
            outputs = model(samples)
            loss = criterion(samples, outputs, targets)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss,
                    optimizer,
                    clip_grad=max_norm,
                    parameters=model.parameters(),
                    create_graph=is_second_order)

        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        samples, targets = prefetcher.next()

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
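
loss_scaler here is a callable that receives the loss, the optimizer, a clipping norm and the model parameters. A sketch in the spirit of timm's NativeScaler, built on torch.cuda.amp.GradScaler; the real scaler used by this example may differ in details:

import torch


class SimpleLossScaler:
    def __init__(self):
        self._scaler = torch.cuda.amp.GradScaler()

    def __call__(self, loss, optimizer, clip_grad=None, parameters=None,
                 create_graph=False):
        # Scale the loss, backprop, optionally clip on unscaled gradients,
        # then step the optimizer and update the scale factor.
        self._scaler.scale(loss).backward(create_graph=create_graph)
        if clip_grad is not None and clip_grad > 0:
            self._scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
        self._scaler.step(optimizer)
        self._scaler.update()
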
Esempio n. 23
0
def train_one_epoch(model: torch.nn.Module,
                    criterion: torch.nn.Module,
                    data_loader: Iterable,
                    optimizer: torch.optim.Optimizer,
                    device: torch.device,
                    epoch: int,
                    loss_scaler,
                    max_norm: float = 0,
                    model_ema: Optional[ModelEma] = None,
                    mixup_fn: Optional[Mixup] = None):
    # TODO fix this for finetuning
    model.train()
    criterion.train()
    end = time.time()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 50

    for samples, targets in metric_logger.log_every(data_loader, print_freq,
                                                    header):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        metric_logger.update(data_time=time.time() - end)
        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        end = time.time()
        with torch.cuda.amp.autocast():
            outputs = model(samples)
            loss = criterion(outputs, targets)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        optimizer.zero_grad()

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(
            optimizer, 'is_second_order') and optimizer.is_second_order
        loss_scaler(loss,
                    optimizer,
                    clip_grad=max_norm,
                    parameters=model.parameters(),
                    create_graph=is_second_order)
        batch_time = time.time() - end
        torch.cuda.synchronize()
        if model_ema is not None:
            model_ema.update(model)

        metric_logger.update(batch_time=batch_time)
        metric_logger.update(throughput=samples.size(0) / batch_time)
        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
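
mixup_fn, when provided, blends pairs of samples and turns the integer targets into soft labels before the forward pass. It is typically timm's Mixup; a simplified stand-in (plain mixup only, no cutmix or label smoothing) could look like the sketch below. With soft targets like these the criterion has to accept probability targets, which is why timm pairs Mixup with SoftTargetCrossEntropy.

import torch
import torch.nn.functional as F


def simple_mixup(samples, targets, num_classes, alpha=0.8):
    # Blend each image with a randomly chosen partner and mix the one-hot
    # labels with the same coefficient.
    lam = torch.distributions.Beta(alpha, alpha).sample().to(samples.device)
    perm = torch.randperm(samples.size(0), device=samples.device)
    mixed_samples = lam * samples + (1.0 - lam) * samples[perm]
    one_hot = F.one_hot(targets, num_classes).float()
    mixed_targets = lam * one_hot + (1.0 - lam) * one_hot[perm]
    return mixed_samples, mixed_targets
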
Esempio n. 24
0
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    mode='normal'):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    if mode == 'postFusion':
        for images, motion, targets in metric_logger.log_every(
                data_loader, print_freq, header):
            images = list(image.to(device) for image in images)
            motion = list(m.to(device) for m in motion)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model([images, motion], targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    else:
        for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                       header):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
Esempio n. 25
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    loss_plt = []
    for images, ann in metric_logger.log_every(data_loader, print_freq,
                                               header):
        targets = []
        for data1 in ann:  # this for loop could be dropped
            boxes = []
            target = {}
            labels = []
            for d in data1:
                box = d['bbox']
                box = [box[0], box[1], box[0] + box[2], box[1] + box[3]]
                boxes.append(box)
                labels.append(d['category_id'])
                # convert everything into a torch.Tensor
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            # there is only one class
            labels = torch.as_tensor(labels, dtype=torch.int64)
            image_id = torch.tensor([data1[0]['image_id']])
            area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
            #print(area)
            #return
            iscrowd = torch.zeros((len(data1), ), dtype=torch.int64)
            # suppose all instances are not crowd
            target["boxes"] = boxes
            target["labels"] = labels
            target["image_id"] = image_id
            target["area"] = area
            target["iscrowd"] = iscrowd
            targets.append(target)
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device)
                    for k, v in t.items()}
                   for t in targets]  # assuming the targets were not already moved to the device??

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        loss_plt.append(losses.item())  # store a float so the autograd graph is not kept alive
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        #break

    return metric_logger, loss_plt
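
The manual box conversion above turns COCO-style [x, y, width, height] boxes into the [x1, y1, x2, y2] format the detector expects. With torchvision 0.8 or newer the same conversion is available as torchvision.ops.box_convert; a small sketch:

import torch
from torchvision.ops import box_convert

# COCO annotations store boxes as [x, y, width, height]; torchvision's
# detection models expect [x1, y1, x2, y2].
coco_boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
xyxy_boxes = box_convert(coco_boxes, in_fmt="xywh", out_fmt="xyxy")
print(xyxy_boxes)  # tensor([[10., 20., 40., 60.]])
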
Esempio n. 26
0
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq,
                    compression_scheduler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    steps_per_epoch = len(data_loader)
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for train_step, (images, targets) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch,
                                                     optimizer)

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)
        if compression_scheduler:
            losses = compression_scheduler.before_backward_pass(
                epoch,
                train_step,
                steps_per_epoch,
                losses,
                optimizer=optimizer)

        optimizer.zero_grad()
        losses.backward()

        if compression_scheduler:
            compression_scheduler.before_parameter_optimization(
                epoch, train_step, steps_per_epoch, optimizer)
        optimizer.step()

        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
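
compression_scheduler follows the hook order visible above: on_minibatch_begin, before_backward_pass, before_parameter_optimization, on_minibatch_end (the same hooks exposed by Distiller's CompressionScheduler). A no-op stand-in that only documents that call order; a real scheduler would apply pruning or regularization at these points:

class NoOpCompressionScheduler:
    # Mirrors the hooks invoked in the training loop above; every method is a
    # no-op, so the loop behaves exactly as it does without compression.

    def on_minibatch_begin(self, epoch, train_step, steps_per_epoch, optimizer):
        pass

    def before_backward_pass(self, epoch, train_step, steps_per_epoch, losses,
                             optimizer=None):
        # A real scheduler may add regularization terms to the loss here.
        return losses

    def before_parameter_optimization(self, epoch, train_step, steps_per_epoch,
                                      optimizer):
        pass

    def on_minibatch_end(self, epoch, train_step, steps_per_epoch, optimizer):
        pass
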
Esempio n. 27
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device)
                      for image in images)  #.to(device) for both
        targets = [{k: v.to(device)
                    for k, v in t.items()}
                   for t in targets]  #.to(device) for both

        loss_dict = model(images, targets)
        '''    
    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
    containing:
        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the class label for each ground-truth box
        - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
    The model returns a Dict[Tensor] during training, containing the classification and regression
    losses for both the RPN and the R-CNN, and the mask loss.
    
    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
    follows:
        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the predicted labels for each image
        - scores (Tensor[N]): the scores of each prediction
        - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
          obtain the final segmentation masks, the soft masks can be thresholded, generally
          with a value of 0.5 (mask >= 0.5)
          '''

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
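
The docstring above describes the input contract of torchvision's Mask R-CNN. A minimal, self-contained sketch of one training forward pass with dummy data; the image size, box coordinates and class count are illustrative:

import torch
import torchvision

model = torchvision.models.detection.maskrcnn_resnet50_fpn(num_classes=2)
model.train()

# one 3xHxW image in [0, 1] and one target with a single ground-truth instance
images = [torch.rand(3, 200, 300)]
targets = [{
    "boxes": torch.tensor([[20.0, 30.0, 120.0, 150.0]]),     # [N, 4], xyxy
    "labels": torch.tensor([1], dtype=torch.int64),           # [N]
    "masks": torch.zeros((1, 200, 300), dtype=torch.uint8),   # [N, H, W]
}]

loss_dict = model(images, targets)  # classification, box, mask and RPN losses
print(sorted(loss_dict.keys()))
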
Esempio n. 28
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # (optional) when the loss spikes it can help to draw the ground-truth
        # boxes over the image with matplotlib to sanity-check the annotations;
        # see the sketch after this example.

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # visualize_bboxes(images,targets)
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
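
When the loss looks suspicious, drawing the ground-truth boxes over the image is a quick sanity check on the annotations (as noted in the example above). A compact matplotlib sketch; the helper name is illustrative:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches


def show_boxes(image_tensor, boxes):
    # image_tensor: CHW float tensor; boxes: [N, 4] in xyxy format.
    img = np.transpose(image_tensor.cpu().detach().numpy(), (1, 2, 0))
    fig, ax = plt.subplots()
    ax.imshow(img)
    for box in boxes:
        x1, y1, x2, y2 = [float(v) for v in box]
        ax.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       fill=False, edgecolor="r"))
    plt.show()

# usage inside the loop above:
# show_boxes(images[0], targets[0]["boxes"])
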