def loss_function(self, classification, regression, anchors, annotations,
                  **kwargs) -> dict:
    criterion = FocalLoss()
    cls_loss, reg_loss = criterion(classification, regression, anchors,
                                   annotations)
    loss = cls_loss + reg_loss
    return {'loss': loss, 'cls_loss': cls_loss, 'reg_loss': reg_loss}
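A minimal usage sketch for the dict this method returns; training_step, module, optimizer, and the batch keys are hypothetical stand-ins, with the model output shape borrowed from the train() example in #3 below.

def training_step(module, optimizer, batch):
    # Hypothetical driver: `module` is assumed to expose loss_function()
    # above and a forward pass returning (features, regression,
    # classification, anchors), as train() below does.
    _, regression, classification, anchors = module(batch['img'])
    losses = module.loss_function(classification, regression, anchors,
                                  batch['annot'])
    optimizer.zero_grad()
    losses['loss'].backward()   # backprop through the combined loss
    optimizer.step()
    return {k: float(v) for k, v in losses.items()}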
Example #2
def __init__(self, model, debug=False):
    super().__init__()
    self.criterion = FocalLoss()
    self.model = model
    self.debug = debug
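This constructor reads like part of a model-with-loss wrapper. A minimal forward sketch under that assumption; the (features, regression, classification, anchors) output shape is taken from the train() example below, everything else is an assumption.

def forward(self, imgs, annotations):
    # Assumed wrapper behavior: run the wrapped detector and score its
    # outputs with the stored FocalLoss criterion.
    _, regression, classification, anchors = self.model(imgs)
    cls_loss, reg_loss = self.criterion(classification, regression,
                                        anchors, annotations)
    return cls_loss, reg_loss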
Example #3
def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {
        'batch_size': opt.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    val_params = {
        'batch_size': opt.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

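    # one input size per compound coefficient d0-d7 (d5 and d6 both use 1280)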
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    training_set = CocoDataset(root_dir=opt.data_path + params.project_name,
                               set=params.train_set,
                               transform=transforms.Compose([
                                   Normalizer(mean=params.mean,
                                              std=params.std),
                                   Augmenter(),
                                   Resizer(input_sizes[opt.compound_coef])
                               ]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = CocoDataset(root_dir=opt.data_path + params.project_name,
                          set=params.val_set,
                          transform=transforms.Compose([
                              Normalizer(mean=params.mean, std=params.std),
                              Resizer(input_sizes[opt.compound_coef])
                          ]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_anchors=9,
                                 num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef)

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except ValueError:  # filename does not encode a step number
            last_step = 0
        model.load_state_dict(torch.load(weights_path))
        print(
            f'loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('initializing weights...')
        init_weights(model)

    # freeze the backbone when training the head only
    if opt.head_only:

        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('backbone frozen')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when training on multiple GPUs with a per-GPU batch size
    # below 4 (useful when GPU memory is limited): with such small batches,
    # BatchNorm statistics are unreliable and training becomes unstable or
    # slow to converge. sync_bn solves this by normalizing over the combined
    # mini-batch across all GPUs before sending the result back to each GPU,
    # at the cost of slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)

    writer = SummaryWriter(
        opt.log_path +
        f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    if params.num_gpus > 0:
        model = model.cuda()
        model = CustomDataParallel(model, params.num_gpus)

    optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    criterion = FocalLoss()

    best_loss = 1e5
    best_epoch = 0
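    # resume the global step counter from the loaded checkpoint, if any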
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)
    for epoch in range(opt.num_epochs):
        try:
            model.train()
            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus > 0:
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    _, regression, classification, anchors = model(imgs)

                    cls_loss, reg_loss = criterion(
                        classification,
                        regression,
                        anchors,
                        annot,
                        # imgs=imgs, obj_list=params.obj_list  # uncomment this to debug
                    )

                    loss = cls_loss + reg_loss
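                    # skip degenerate batches (zero or non-finite loss)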
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}'
                        .format(step, epoch + 1, opt.num_epochs, iter + 1,
                                num_iter_per_epoch, cls_loss.item(),
                                reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss},
                                       step)
                    writer.add_scalars('Classification_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                except Exception as e:
                    print(traceback.format_exc())
                    print(e)
                    continue
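            # ReduceLROnPlateau steps on a metric; use the epoch's mean training loss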
            scheduler.step(np.mean(epoch_loss))

            if step % opt.save_interval == 0 and step > 0:
                save_checkpoint(
                    model,
                    f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus > 0:
                            annot = annot.cuda()
                        _, regression, classification, anchors = model(imgs)
                        cls_loss, reg_loss = criterion(classification,
                                                       regression, anchors,
                                                       annot)

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print(
                    'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'
                    .format(epoch + 1, opt.num_epochs, cls_loss, reg_loss,
                            loss))
                writer.add_scalars('Total_loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss},
                                   step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch

                    save_checkpoint(
                        model,
                        f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth'
                    )

                    # onnx export is not tested.
                    # dummy_input = torch.rand(opt.batch_size, 3, 512, 512)
                    # if torch.cuda.is_available():
                    #     dummy_input = dummy_input.cuda()
                    # if isinstance(model, nn.DataParallel):
                    #     model.module.backbone_net.model.set_swish(memory_efficient=False)
                    #
                    #     torch.onnx.export(model.module, dummy_input,
                    #                       os.path.join(opt.saved_path, 'signatrix_efficientdet_coco.onnx'),
                    #                       verbose=False)
                    #     model.module.backbone_net.model.set_swish(memory_efficient=True)
                    # else:
                    #     model.backbone_net.model.set_swish(memory_efficient=False)
                    #
                    #     torch.onnx.export(model, dummy_input,
                    #                       os.path.join(opt.saved_path, 'signatrix_efficientdet_coco.onnx'),
                    #                       verbose=False)
                    #     model.backbone_net.model.set_swish(memory_efficient=True)

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print(
                        'Stop training at epoch {}. The lowest loss achieved is {}'
                        .format(epoch, loss))
                    break
        except KeyboardInterrupt:
            save_checkpoint(
                model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
            break
    writer.close()
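A hedged sketch of how train() might be invoked; each field mirrors an opt attribute read above, but the concrete values and the argparse.Namespace packaging are assumptions, not confirmed by this listing.

from argparse import Namespace

# Hypothetical invocation; every field corresponds to an `opt` attribute
# that train() reads above, values are illustrative only.
opt = Namespace(project='coco', compound_coef=0, batch_size=12,
                num_workers=4, lr=1e-3, num_epochs=500, head_only=False,
                load_weights=None, data_path='datasets/',
                saved_path='logs/', log_path='logs/', save_interval=500,
                val_interval=1, es_min_delta=0.0, es_patience=0)
train(opt)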
Example #4
def __init__(self, model):
    super().__init__()
    self.criterion = FocalLoss()
    self.model = model
Example #5
def __init__(self, model):
    super().__init__()
    self.model = model
    self.criterion = FocalLoss()
    self.cls_criterion = nn.CrossEntropyLoss()
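With both a FocalLoss and a CrossEntropyLoss stored, this wrapper presumably adds an auxiliary classification term to the detection loss. A minimal sketch under that assumption; the logits/labels interface and the unweighted sum are hypothetical.

def forward(self, imgs, annotations, logits=None, labels=None):
    # Detection losses via FocalLoss; optional auxiliary cross-entropy on
    # externally supplied logits/labels (hypothetical interface).
    _, regression, classification, anchors = self.model(imgs)
    cls_loss, reg_loss = self.criterion(classification, regression,
                                        anchors, annotations)
    loss = cls_loss + reg_loss
    if logits is not None:
        loss = loss + self.cls_criterion(logits, labels)
    return loss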