コード例 #1
0
def train(hyp, opt, tb_writer=None):
    logger.info(
        colorstr('hyperparameters: ') + ', '.join(f'{k}={v}'
                                                  for k, v in hyp.items()))
    save_dir, epochs, batch_size, weights = Path(
        opt.save_dir), opt.epochs, opt.batch_size, opt.weights

    # Directories
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / 'last.pkl'
    best = wdir / 'best.pkl'
    results_file = save_dir / 'results.txt'

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = not opt.no_cuda
    if cuda:
        jt.flags.use_cuda = 1

    init_seeds(1)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # data dict

    check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if opt.single_cls and len(
        data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    model = Model(opt.cfg, ch=3, nc=nc)  # create
    pretrained = weights.endswith('.pkl')
    if pretrained:
        model.load(weights)  # load

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, jt.Var):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, jt.Var):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = optim.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    loggers = {}  # loggers dict

    start_epoch, best_fitness = 0, 0.0

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[
        -1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # EMA
    ema = ModelEMA(model)

    # Trainloader
    dataloader = create_dataloader(train_path,
                                   imgsz,
                                   batch_size,
                                   gs,
                                   opt,
                                   hyp=hyp,
                                   augment=True,
                                   cache=opt.cache_images,
                                   rect=opt.rect,
                                   workers=opt.workers,
                                   image_weights=opt.image_weights,
                                   quad=opt.quad,
                                   prefix=colorstr('train: '))

    mlc = np.concatenate(dataloader.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    ema.updates = start_epoch * nb // accumulate  # set EMA updates
    testloader = create_dataloader(
        test_path,
        imgsz_test,
        batch_size,
        gs,
        opt,  # testloader
        hyp=hyp,
        cache=opt.cache_images and not opt.notest,
        rect=True,
        workers=opt.workers,
        pad=0.5,
        prefix=colorstr('val: '))

    labels = np.concatenate(dataloader.labels, 0)
    c = jt.array(labels[:, 0])  # classes

    # cf = torch.bincount(c.int(), minlength=nc) + 1.  # frequency
    # model._initialize_biases(cf)
    if plots:
        plot_labels(labels, save_dir, loggers)
        if tb_writer:
            tb_writer.add_histogram('classes', c.numpy(), 0)

    # Anchors
    if not opt.noautoanchor:
        check_anchors(dataloader,
                      model=model,
                      thr=hyp['anchor_t'],
                      imgsz=imgsz)

    # Model parameters
    hyp['box'] *= 3. / nl  # scale to layers
    hyp['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640)**2 * 3. / nl  # scale to image size and layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(
        dataloader.labels, nc) * nc  # attach class weights
    model.names = names
    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb),
             1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0
               )  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n'
                f'Using {dataloader.num_workers} dataloader workers\n'
                f'Logging results to {save_dir}\n'
                f'Starting training for {epochs} epochs...')
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            cw = model.class_weights.numpy() * (1 -
                                                maps)**2 / nc  # class weights
            iw = labels_to_image_weights(dataloader.labels,
                                         nc=nc,
                                         class_weights=cw)  # image weights
            dataloader.indices = random.choices(
                range(dataloader.n), weights=iw,
                k=dataloader.n)  # rand weighted idx

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = jt.zeros((4, ))  # mean losses
        pbar = enumerate(dataloader)
        logger.info(
            ('\n' + '%10s' * 7) %
            ('Epoch', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
        pbar = tqdm(pbar, total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                # accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())

                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [
                        hyp['warmup_bias_lr'] if j == 2 else 0.0,
                        x['initial_lr'] * lf(epoch)
                    ])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(
                            ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = nn.interpolate(imgs,
                                          size=ns,
                                          mode='bilinear',
                                          align_corners=False)
            # Forward
            pred = model(imgs)  # forward
            loss, loss_items = compute_loss(pred, targets,
                                            model)  # loss scaled by batch_size
            if opt.quad:
                loss *= 4.

            # Optimize
            optimizer.step(loss)
            if ema:
                ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            s = ('%10s' + '%10.4g' * 6) % ('%g/%g' %
                                           (epoch, epochs - 1), *mloss,
                                           targets.shape[0], imgs.shape[-1])
            pbar.set_description(s)

            # Plot
            if plots and ni < 3:
                f = save_dir / f'train_batch{ni}.jpg'  # filename
                Thread(target=plot_images,
                       args=(imgs, targets, paths, f),
                       daemon=True).start()
                # if tb_writer:
                #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                #     tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # mAP
        if ema:
            ema.update_attr(model,
                            include=[
                                'yaml', 'nc', 'hyp', 'gr', 'names', 'stride',
                                'class_weights'
                            ])
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            results, maps, times = test.test(data=opt.data,
                                             batch_size=batch_size,
                                             imgsz=imgsz_test,
                                             model=ema.ema,
                                             single_cls=opt.single_cls,
                                             dataloader=testloader,
                                             save_dir=save_dir,
                                             plots=plots and final_epoch)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.4g' * 7 % results +
                    '\n')  # P, R, [email protected], [email protected], val_loss(box, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                      (results_file, opt.bucket, opt.name))

        # Log
        tags = [
            'train/box_loss',
            'train/obj_loss',
            'train/cls_loss',  # train loss
            'metrics/precision',
            'metrics/recall',
            'metrics/mAP_0.5',
            'metrics/mAP_0.5-0.95',
            'val/box_loss',
            'val/obj_loss',
            'val/cls_loss',  # val loss
            'x/lr0',
            'x/lr1',
            'x/lr2'
        ]  # params
        for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
            if tb_writer:
                if hasattr(x, "numpy"):
                    x = x.numpy()
                tb_writer.add_scalar(tag, x, epoch)  # tensorboard

        # Update best mAP
        fi = fitness(np.array(results).reshape(
            1, -1))  # weighted combination of [P, R, [email protected], [email protected]]
        if fi > best_fitness:
            best_fitness = fi

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            # Save last, best and delete
            jt.save(ema.ema.state_dict(), last)
            if best_fitness == fi:
                jt.save(ema.ema.state_dict(), best)
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training
    # Strip optimizers
    final = best if best.exists() else last  # final model
    if opt.bucket:
        os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

    # Plots
    if plots:
        plot_results(save_dir=save_dir)  # save as results.png

    # Test best.pkl
    logger.info('%g epochs completed in %.3f hours.\n' %
                (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    best_model = Model(opt.cfg)
    best_model.load(str(final))
    best_model = best_model.fuse()
    if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
        for conf, iou, save_json in ([0.25, 0.45,
                                      False], [0.001, 0.65,
                                               True]):  # speed, mAP tests
            results, _, _ = test.test(opt.data,
                                      batch_size=total_batch_size,
                                      imgsz=imgsz_test,
                                      conf_thres=conf,
                                      iou_thres=iou,
                                      model=best_model,
                                      single_cls=opt.single_cls,
                                      dataloader=testloader,
                                      save_dir=save_dir,
                                      save_json=save_json,
                                      plots=False)

    return results
コード例 #2
0
 def __init__(self, cfg_path):
     with open(cfg_path, 'r') as rf:
         self.cfg = yaml.safe_load(rf)
     self.data_cfg = self.cfg['data']
     self.model_cfg = self.cfg['model']
     self.optim_cfg = self.cfg['optim']
     self.val_cfg = self.cfg['val']
     print(self.data_cfg)
     print(self.model_cfg)
     print(self.optim_cfg)
     print(self.val_cfg)
     os.environ['CUDA_VISIBLE_DEVICES'] = self.cfg['gpus']
     self.gpu_num = len(str(self.cfg['gpus']).split(","))
     dist.init_process_group(backend='nccl')
     self.tdata = CustomerDataSets(json_path=self.data_cfg['train_json_path'],
                                   debug=self.data_cfg['debug'],
                                   augment=True,
                                   )
     self.tloader = DataLoader(dataset=self.tdata,
                               batch_size=self.data_cfg['batch_size'],
                               num_workers=self.data_cfg['num_workers'],
                               collate_fn=self.tdata.collate_fn,
                               sampler=DistributedSampler(dataset=self.tdata, shuffle=True))
     self.vdata = CustomerDataSets(json_path=self.data_cfg['val_json_path'],
                                   debug=self.data_cfg['debug'],
                                   augment=False,
                                   )
     self.vloader = DataLoader(dataset=self.vdata,
                               batch_size=self.data_cfg['batch_size'],
                               num_workers=self.data_cfg['num_workers'],
                               collate_fn=self.vdata.collate_fn,
                               sampler=DistributedSampler(dataset=self.vdata, shuffle=False))
     print("train_data: ", len(self.tdata), " | ",
           "val_data: ", len(self.vdata))
     print("train_iter: ", len(self.tloader), " | ",
           "val_iter: ", len(self.vloader))
     if self.cfg['model_name'] == "v4":
         net = YOLOv4
     elif self.cfg['model_name'] == "v5":
         net = YOLOv5
     else:
         raise NotImplementedError("{:s} not supported yet".format(self.cfg['model_name']))
     model = net(num_cls=self.model_cfg['num_cls'],
                 anchors=self.model_cfg['anchors'],
                 strides=self.model_cfg['strides'],
                 scale_name=self.model_cfg['scale_name'],
                 )
     self.best_map = 0.
     optimizer = split_optimizer(model, self.optim_cfg)
     local_rank = dist.get_rank()
     self.local_rank = local_rank
     self.device = torch.device("cuda", local_rank)
     model.to(self.device)
     self.scaler = amp.GradScaler(enabled=True)
     if self.optim_cfg['sync_bn']:
         model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
     self.model = nn.parallel.distributed.DistributedDataParallel(model,
                                                                  device_ids=[local_rank],
                                                                  output_device=local_rank)
     self.optimizer = optimizer
     self.ema = ModelEMA(self.model)
     self.lr_adjuster = IterWarmUpCosineDecayMultiStepLRAdjust(init_lr=self.optim_cfg['lr'],
                                                               warm_up_epoch=self.optim_cfg['warm_up_epoch'],
                                                               iter_per_epoch=len(self.tloader),
                                                               epochs=self.optim_cfg['epochs'],
                                                               alpha=self.optim_cfg['alpha'],
                                                               gamma=self.optim_cfg['gamma'],
                                                               bias_idx=2,
                                                               milestones=self.optim_cfg['milestones']
                                                               )
     self.obj_logger = AverageLogger()
     self.iou_logger = AverageLogger()
     self.loss_logger = AverageLogger()
     self.map_logger = AverageLogger()
コード例 #3
0
 def __init__(self, cfg_path):
     with open(cfg_path, 'r') as rf:
         self.cfg = yaml.safe_load(rf)
     self.data_cfg = self.cfg['data']
     self.model_cfg = self.cfg['model']
     self.optim_cfg = self.cfg['optim']
     self.val_cfg = self.cfg['val']
     print(self.data_cfg)
     print(self.model_cfg)
     print(self.optim_cfg)
     print(self.val_cfg)
     os.environ['CUDA_VISIBLE_DEVICES'] = self.cfg['gpus']
     self.gpu_num = len(self.cfg['gpus'].split(','))
     dist.init_process_group(backend='nccl')
     self.tdata = COCODataSets(
         img_root=self.data_cfg['train_img_root'],
         annotation_path=self.data_cfg['train_annotation_path'],
         max_thresh=self.data_cfg['max_thresh'],
         debug=self.data_cfg['debug'],
         use_crowd=self.data_cfg['use_crowd'],
         augments=True,
         remove_blank=self.data_cfg['remove_blank'])
     self.tloader = DataLoader(dataset=self.tdata,
                               batch_size=self.data_cfg['batch_size'],
                               num_workers=self.data_cfg['num_workers'],
                               collate_fn=self.tdata.collect_fn,
                               sampler=DistributedSampler(
                                   dataset=self.tdata, shuffle=True))
     self.vdata = COCODataSets(
         img_root=self.data_cfg['val_img_root'],
         annotation_path=self.data_cfg['val_annotation_path'],
         max_thresh=self.data_cfg['max_thresh'],
         debug=self.data_cfg['debug'],
         use_crowd=self.data_cfg['use_crowd'],
         augments=False,
         remove_blank=False)
     self.vloader = DataLoader(dataset=self.vdata,
                               batch_size=self.data_cfg['batch_size'],
                               num_workers=self.data_cfg['num_workers'],
                               collate_fn=self.vdata.collect_fn,
                               sampler=DistributedSampler(
                                   dataset=self.vdata, shuffle=False))
     print("train_data: ", len(self.tdata), " | ", "val_data: ",
           len(self.vdata), " | ", "empty_data: ",
           self.tdata.empty_images_len)
     print("train_iter: ", len(self.tloader), " | ", "val_iter: ",
           len(self.vloader))
     model = SparseRCNN(**self.model_cfg)
     self.best_map = 0.
     optimizer = split_optimizer_v2(model, self.optim_cfg)
     local_rank = dist.get_rank()
     self.local_rank = local_rank
     self.device = torch.device("cuda", local_rank)
     model.to(self.device)
     if self.optim_cfg['sync_bn']:
         model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
     self.model = nn.parallel.distributed.DistributedDataParallel(
         model, device_ids=[local_rank], output_device=local_rank)
     self.scaler = amp.GradScaler(
         enabled=True) if self.optim_cfg['amp'] else None
     self.optimizer = optimizer
     self.ema = ModelEMA(self.model)
     self.lr_adjuster = IterWarmUpMultiStepDecay(
         init_lr=self.optim_cfg['lr'],
         milestones=self.optim_cfg['milestones'],
         warm_up_iter=self.optim_cfg['warm_up_iter'],
         iter_per_epoch=len(self.tloader),
         epochs=self.optim_cfg['epochs'],
         alpha=self.optim_cfg['alpha'],
         warm_up_factor=self.optim_cfg['warm_up_factor'])
     self.cls_loss_logger = AverageLogger()
     self.l1_loss_logger = AverageLogger()
     self.iou_loss_logger = AverageLogger()
     self.match_num_logger = AverageLogger()
     self.loss_logger = AverageLogger()
コード例 #4
0
class DDPMixSolver(object):
    def __init__(self, cfg_path):
        with open(cfg_path, 'r') as rf:
            self.cfg = yaml.safe_load(rf)
        self.data_cfg = self.cfg['data']
        self.model_cfg = self.cfg['model']
        self.optim_cfg = self.cfg['optim']
        self.val_cfg = self.cfg['val']
        print(self.data_cfg)
        print(self.model_cfg)
        print(self.optim_cfg)
        print(self.val_cfg)
        os.environ['CUDA_VISIBLE_DEVICES'] = self.cfg['gpus']
        self.gpu_num = len(str(self.cfg['gpus']).split(","))
        dist.init_process_group(backend='nccl')
        self.tdata = CustomerDataSets(json_path=self.data_cfg['train_json_path'],
                                      debug=self.data_cfg['debug'],
                                      augment=True,
                                      )
        self.tloader = DataLoader(dataset=self.tdata,
                                  batch_size=self.data_cfg['batch_size'],
                                  num_workers=self.data_cfg['num_workers'],
                                  collate_fn=self.tdata.collate_fn,
                                  sampler=DistributedSampler(dataset=self.tdata, shuffle=True))
        self.vdata = CustomerDataSets(json_path=self.data_cfg['val_json_path'],
                                      debug=self.data_cfg['debug'],
                                      augment=False,
                                      )
        self.vloader = DataLoader(dataset=self.vdata,
                                  batch_size=self.data_cfg['batch_size'],
                                  num_workers=self.data_cfg['num_workers'],
                                  collate_fn=self.vdata.collate_fn,
                                  sampler=DistributedSampler(dataset=self.vdata, shuffle=False))
        print("train_data: ", len(self.tdata), " | ",
              "val_data: ", len(self.vdata))
        print("train_iter: ", len(self.tloader), " | ",
              "val_iter: ", len(self.vloader))
        if self.cfg['model_name'] == "v4":
            net = YOLOv4
        elif self.cfg['model_name'] == "v5":
            net = YOLOv5
        else:
            raise NotImplementedError("{:s} not supported yet".format(self.cfg['model_name']))
        model = net(num_cls=self.model_cfg['num_cls'],
                    anchors=self.model_cfg['anchors'],
                    strides=self.model_cfg['strides'],
                    scale_name=self.model_cfg['scale_name'],
                    )
        self.best_map = 0.
        optimizer = split_optimizer(model, self.optim_cfg)
        local_rank = dist.get_rank()
        self.local_rank = local_rank
        self.device = torch.device("cuda", local_rank)
        model.to(self.device)
        self.scaler = amp.GradScaler(enabled=True)
        if self.optim_cfg['sync_bn']:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        self.model = nn.parallel.distributed.DistributedDataParallel(model,
                                                                     device_ids=[local_rank],
                                                                     output_device=local_rank)
        self.optimizer = optimizer
        self.ema = ModelEMA(self.model)
        self.lr_adjuster = IterWarmUpCosineDecayMultiStepLRAdjust(init_lr=self.optim_cfg['lr'],
                                                                  warm_up_epoch=self.optim_cfg['warm_up_epoch'],
                                                                  iter_per_epoch=len(self.tloader),
                                                                  epochs=self.optim_cfg['epochs'],
                                                                  alpha=self.optim_cfg['alpha'],
                                                                  gamma=self.optim_cfg['gamma'],
                                                                  bias_idx=2,
                                                                  milestones=self.optim_cfg['milestones']
                                                                  )
        self.obj_logger = AverageLogger()
        self.iou_logger = AverageLogger()
        self.loss_logger = AverageLogger()
        self.map_logger = AverageLogger()

    def train(self, epoch):
        self.obj_logger.reset()
        self.iou_logger.reset()
        self.loss_logger.reset()
        self.model.train()
        if self.local_rank == 0:
            pbar = tqdm(self.tloader)
        else:
            pbar = self.tloader
        for i, (img_tensor, targets_tensor) in enumerate(pbar):
            with torch.no_grad():
                if len(self.data_cfg['multi_scale']) > 2:
                    target_size = np.random.choice(self.data_cfg['multi_scale'])
                    img_tensor = interpolate(img_tensor, mode='bilinear', size=target_size, align_corners=False)
                _, _, h, w = img_tensor.shape
                img_tensor = img_tensor.to(self.device)
                targets_tensor = targets_tensor.to(self.device)
            self.optimizer.zero_grad()
            with amp.autocast(enabled=True):
                ret = self.model(img_tensor, targets_tensor)
                obj_loss = ret['obj_loss']
                iou_loss = ret['iou_loss']
                loss = obj_loss + iou_loss
            self.scaler.scale(loss).backward()
            self.lr_adjuster(self.optimizer, i, epoch)
            ulr = self.optimizer.param_groups[0]['lr']
            dlr = self.optimizer.param_groups[2]['lr']
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.ema.update(self.model)
            self.obj_logger.update(obj_loss.item())
            self.iou_logger.update(iou_loss.item())
            self.loss_logger.update(loss.item())
            if self.local_rank == 0:
                pbar.set_description(
                    "epoch:{:2d}|size:{:3d}|loss:{:6.4f}|obj_loss:{:6.4f}|iou_loss:{:6.4f}|ulr:{:8.6f},dlr:{:8.6f}".format(
                        epoch + 1,
                        h,
                        self.loss_logger.avg(),
                        obj_loss.item(),
                        iou_loss.item(),
                        ulr,
                        dlr
                    ))
        self.ema.update_attr(self.model)
        print(
            "epoch:{:3d}|local:{:3d}|loss:{:6.4f}||obj_loss:{:6.4f}|iou_loss:{:6.4f}".format(epoch + 1,
                                                                                             self.local_rank,
                                                                                             self.loss_logger.avg(),
                                                                                             self.obj_logger.avg(),
                                                                                             self.iou_logger.avg(),
                                                                                             )
        )

    @torch.no_grad()
    def val(self, epoch):
        self.model.eval()
        self.ema.ema.eval()
        predict_list = list()
        target_list = list()
        if self.local_rank == 0:
            pbar = tqdm(self.vloader)
        else:
            pbar = self.vloader
        for img_tensor, targets_tensor in pbar:
            _, _, h, w = img_tensor.shape
            targets_tensor[:, 1:] = targets_tensor[:, 1:] * torch.tensor(data=[w, h, w, h])
            targets_tensor[:, [1, 2]] = targets_tensor[:, [1, 2]] - targets_tensor[:, [3, 4]] * 0.5
            targets_tensor[:, [3, 4]] = targets_tensor[:, [1, 2]] + targets_tensor[:, [3, 4]]
            img_tensor = img_tensor.to(self.device)
            targets_tensor = targets_tensor.to(self.device)
            predicts = self.ema.ema(img_tensor)['predicts']
            for i, pred in enumerate(predicts):
                if pred is not None:
                    pred = torch.cat([pred, torch.zeros_like(pred[..., [0]])], dim=-1)
                predict_list.append(pred)
                targets_sample = targets_tensor[targets_tensor[:, 0] == i][:, 1:]
                targets_sample = torch.cat([torch.zeros_like(targets_sample[..., [0]]), targets_sample], dim=-1)
                target_list.append(targets_sample)
        mp, mr, map50, map = coco_map(predict_list, target_list)
        mp = reduce_sum(torch.tensor(mp, device=self.device)).item() / self.gpu_num
        mr = reduce_sum(torch.tensor(mr, device=self.device)).item() / self.gpu_num
        map50 = reduce_sum(torch.tensor(map50, device=self.device)).item() / self.gpu_num
        map = reduce_sum(torch.tensor(map, device=self.device)).item() / self.gpu_num
        if self.local_rank == 0:
            print("epoch: {:2d}|gpu_num:{:d}|mp:{:6.4f}|mr:{:6.4f}|map50:{:6.4f}|map:{:6.4f}"
                  .format(epoch + 1,
                          self.gpu_num,
                          mp * 100,
                          mr * 100,
                          map50 * 100,
                          map * 100))
        last_weight_path = os.path.join(self.val_cfg['weight_path'],
                                        "{:s}_{:s}_last.pth"
                                        .format(self.cfg['model_name'], self.model_cfg['scale_name']))
        best_map_weight_path = os.path.join(self.val_cfg['weight_path'],
                                            "{:s}_{:s}_best_map.pth"
                                            .format(self.cfg['model_name'], self.model_cfg['scale_name']))
        ema_static = self.ema.ema.state_dict()
        cpkt = {
            "ema": ema_static,
            "map": map * 100,
            "epoch": epoch,
        }
        if self.local_rank != 0:
            return
        torch.save(cpkt, last_weight_path)
        if map > self.best_map:
            torch.save(cpkt, best_map_weight_path)
            self.best_map = map

    def run(self):
        for epoch in range(self.optim_cfg['epochs']):
            self.train(epoch)
            if (epoch + 1) % self.val_cfg['interval'] == 0:
                self.val(epoch)
        dist.destroy_process_group()
        torch.cuda.empty_cache()
コード例 #5
0
class DDPMixSolver(object):
    def __init__(self, cfg_path):
        with open(cfg_path, 'r') as rf:
            self.cfg = yaml.safe_load(rf)
        self.data_cfg = self.cfg['data']
        self.model_cfg = self.cfg['model']
        self.optim_cfg = self.cfg['optim']
        self.val_cfg = self.cfg['val']
        print(self.data_cfg)
        print(self.model_cfg)
        print(self.optim_cfg)
        print(self.val_cfg)
        os.environ['CUDA_VISIBLE_DEVICES'] = self.cfg['gpus']
        self.gpu_num = len(self.cfg['gpus'].split(','))
        dist.init_process_group(backend='nccl')
        self.tdata = COCODataSets(
            img_root=self.data_cfg['train_img_root'],
            annotation_path=self.data_cfg['train_annotation_path'],
            max_thresh=self.data_cfg['max_thresh'],
            debug=self.data_cfg['debug'],
            use_crowd=self.data_cfg['use_crowd'],
            augments=True,
            remove_blank=self.data_cfg['remove_blank'])
        self.tloader = DataLoader(dataset=self.tdata,
                                  batch_size=self.data_cfg['batch_size'],
                                  num_workers=self.data_cfg['num_workers'],
                                  collate_fn=self.tdata.collect_fn,
                                  sampler=DistributedSampler(
                                      dataset=self.tdata, shuffle=True))
        self.vdata = COCODataSets(
            img_root=self.data_cfg['val_img_root'],
            annotation_path=self.data_cfg['val_annotation_path'],
            max_thresh=self.data_cfg['max_thresh'],
            debug=self.data_cfg['debug'],
            use_crowd=self.data_cfg['use_crowd'],
            augments=False,
            remove_blank=False)
        self.vloader = DataLoader(dataset=self.vdata,
                                  batch_size=self.data_cfg['batch_size'],
                                  num_workers=self.data_cfg['num_workers'],
                                  collate_fn=self.vdata.collect_fn,
                                  sampler=DistributedSampler(
                                      dataset=self.vdata, shuffle=False))
        print("train_data: ", len(self.tdata), " | ", "val_data: ",
              len(self.vdata), " | ", "empty_data: ",
              self.tdata.empty_images_len)
        print("train_iter: ", len(self.tloader), " | ", "val_iter: ",
              len(self.vloader))
        model = SparseRCNN(**self.model_cfg)
        self.best_map = 0.
        optimizer = split_optimizer_v2(model, self.optim_cfg)
        local_rank = dist.get_rank()
        self.local_rank = local_rank
        self.device = torch.device("cuda", local_rank)
        model.to(self.device)
        if self.optim_cfg['sync_bn']:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        self.model = nn.parallel.distributed.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
        self.scaler = amp.GradScaler(
            enabled=True) if self.optim_cfg['amp'] else None
        self.optimizer = optimizer
        self.ema = ModelEMA(self.model)
        self.lr_adjuster = IterWarmUpMultiStepDecay(
            init_lr=self.optim_cfg['lr'],
            milestones=self.optim_cfg['milestones'],
            warm_up_iter=self.optim_cfg['warm_up_iter'],
            iter_per_epoch=len(self.tloader),
            epochs=self.optim_cfg['epochs'],
            alpha=self.optim_cfg['alpha'],
            warm_up_factor=self.optim_cfg['warm_up_factor'])
        self.cls_loss_logger = AverageLogger()
        self.l1_loss_logger = AverageLogger()
        self.iou_loss_logger = AverageLogger()
        self.match_num_logger = AverageLogger()
        self.loss_logger = AverageLogger()
        # if self.local_rank == 0:
        #     print(self.model)

    def train(self, epoch):
        self.loss_logger.reset()
        self.cls_loss_logger.reset()
        self.l1_loss_logger.reset()
        self.iou_loss_logger.reset()
        self.match_num_logger.reset()
        self.model.train()
        if self.local_rank == 0:
            pbar = tqdm(self.tloader)
        else:
            pbar = self.tloader
        for i, (img_tensor, targets_tensor, batch_len) in enumerate(pbar):
            _, _, h, w = img_tensor.shape
            with torch.no_grad():
                img_tensor = img_tensor.to(self.device)
                targets_tensor = targets_tensor.to(self.device)
            self.optimizer.zero_grad()
            if self.scaler is not None:
                with amp.autocast(enabled=True):
                    out = self.model(img_tensor,
                                     targets={
                                         "target": targets_tensor,
                                         "batch_len": batch_len
                                     })
                    cls_loss = out['cls_loss']
                    l1_loss = out['l1_loss']
                    iou_loss = out['iou_loss']
                    match_num = out['match_num']
                    loss = cls_loss + l1_loss + iou_loss
                    self.scaler.scale(loss).backward()
                    self.lr_adjuster(self.optimizer, i, epoch)
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
            else:
                out = self.model(img_tensor,
                                 targets={
                                     "target": targets_tensor,
                                     "batch_len": batch_len
                                 })
                cls_loss = out['cls_loss']
                l1_loss = out['l1_loss']
                iou_loss = out['iou_loss']
                match_num = out['match_num']
                loss = cls_loss + l1_loss + iou_loss
                loss.backward()
                self.lr_adjuster(self.optimizer, i, epoch)
                self.optimizer.step()
            self.ema.update(self.model)
            lr = self.optimizer.param_groups[0]['lr']
            self.loss_logger.update(loss.item())
            self.iou_loss_logger.update(iou_loss.item())
            self.l1_loss_logger.update(l1_loss.item())
            self.cls_loss_logger.update(cls_loss.item())
            self.match_num_logger.update(match_num)
            str_template = \
                "epoch:{:2d}|match_num:{:0>4d}|size:{:3d}|loss:{:6.4f}|cls:{:6.4f}|l1:{:6.4f}|iou:{:6.4f}|lr:{:8.6f}"
            if self.local_rank == 0:
                pbar.set_description(
                    str_template.format(epoch + 1, int(match_num), h,
                                        self.loss_logger.avg(),
                                        self.cls_loss_logger.avg(),
                                        self.l1_loss_logger.avg(),
                                        self.iou_loss_logger.avg(), lr))
        self.ema.update_attr(self.model)
        loss_avg = reduce_sum(
            torch.tensor(self.loss_logger.avg(),
                         device=self.device)) / self.gpu_num
        iou_loss_avg = reduce_sum(
            torch.tensor(self.iou_loss_logger.avg(),
                         device=self.device)).item() / self.gpu_num
        l1_loss_avg = reduce_sum(
            torch.tensor(self.l1_loss_logger.avg(),
                         device=self.device)).item() / self.gpu_num
        cls_loss_avg = reduce_sum(
            torch.tensor(self.cls_loss_logger.avg(),
                         device=self.device)).item() / self.gpu_num
        match_num_sum = reduce_sum(
            torch.tensor(self.match_num_logger.sum(),
                         device=self.device)).item() / self.gpu_num
        if self.local_rank == 0:
            final_template = "epoch:{:2d}|match_num:{:d}|loss:{:6.4f}|cls:{:6.4f}|l1:{:6.4f}|iou:{:6.4f}"
            print(
                final_template.format(epoch + 1, int(match_num_sum), loss_avg,
                                      cls_loss_avg, l1_loss_avg, iou_loss_avg))

    @torch.no_grad()
    def val(self, epoch):
        predict_list = list()
        target_list = list()
        self.model.eval()
        self.ema.ema.eval()
        if self.local_rank == 0:
            pbar = tqdm(self.vloader)
        else:
            pbar = self.vloader
        for img_tensor, targets_tensor, batch_len in pbar:
            img_tensor = img_tensor.to(self.device)
            targets_tensor = targets_tensor.to(self.device)
            predicts = self.ema.ema(img_tensor)['predicts']
            for pred, target in zip(predicts, targets_tensor.split(batch_len)):
                predict_list.append(pred)
                target_list.append(target)
        mp, mr, map50, mean_ap = coco_map(predict_list, target_list)
        mp = reduce_sum(torch.tensor(mp, device=self.device)) / self.gpu_num
        mr = reduce_sum(torch.tensor(mr, device=self.device)) / self.gpu_num
        map50 = reduce_sum(torch.tensor(map50,
                                        device=self.device)) / self.gpu_num
        mean_ap = reduce_sum(torch.tensor(mean_ap,
                                          device=self.device)) / self.gpu_num

        if self.local_rank == 0:
            print("*" * 20, "eval start", "*" * 20)
            print(
                "epoch: {:2d}|mp:{:6.4f}|mr:{:6.4f}|map50:{:6.4f}|map:{:6.4f}".
                format(epoch + 1, mp * 100, mr * 100, map50 * 100,
                       mean_ap * 100))
            print("*" * 20, "eval end", "*" * 20)
        last_weight_path = os.path.join(
            self.val_cfg['weight_path'],
            "{:s}_{:s}_last.pth".format(self.cfg['model_name'],
                                        self.model_cfg['backbone']))
        best_map_weight_path = os.path.join(
            self.val_cfg['weight_path'],
            "{:s}_{:s}_best_map.pth".format(self.cfg['model_name'],
                                            self.model_cfg['backbone']))
        model_static = self.model.module.state_dict()
        cpkt = {
            "model": model_static,
            "map": mean_ap * 100,
            "epoch": epoch,
            "ema": self.ema.ema.state_dict()
        }
        if self.local_rank != 0:
            return
        torch.save(cpkt, last_weight_path)
        if mean_ap > self.best_map:
            torch.save(cpkt, best_map_weight_path)
            self.best_map = mean_ap

    def run(self):
        for epoch in range(self.optim_cfg['epochs']):
            self.train(epoch)
            if (epoch + 1) % self.val_cfg['interval'] == 0:
                self.val(epoch)