Beispiel #1
0
class lightenYOLOv4(pl.LightningModule):
    def __init__(self, weight_path, resume, exp_name, accumulate=None):
        """Assemble the detector, its loss function and the evaluator.

        Args:
            weight_path: forwarded to ``Build_Model`` as ``weight_path``.
            resume: forwarded to ``Build_Model`` as ``resume``.
            exp_name: forwarded to ``Evaluator`` as ``exp_name``.
            accumulate: accepted for signature compatibility; this
                constructor does not read it.
        """
        # Original author's note: precision=16 for fp16.
        super().__init__()

        self.model = Build_Model(weight_path=weight_path, resume=resume)

        model_cfg = cfg.MODEL
        self.criterion = YoloV4Loss(
            anchors=model_cfg["ANCHORS"],
            strides=model_cfg["STRIDES"],
            iou_threshold_loss=cfg.TRAIN["IOU_THRESHOLD_LOSS"],
        )

        self.evaluator = Evaluator(self.model, showatt=False, exp_name=exp_name)
        # Start from a clean prediction file before any validation happens.
        self.evaluator.clear_predict_file()

    # how you want your model to do inference/predictions
    def forward(self, img):
        """Run the wrapped model on ``img`` and return its two outputs as a tuple."""
        preds, preds_d = self.model(img)
        return preds, preds_d

    """
    def training_epoch_end(self,outputs):
        #  the function is called after every epoch is completed
        # calculating average loss
        avg_loss = 0
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        # creating log dictionary
        result = pl.TrainResult()
        result.log('val/loss_epoch', avg_loss)
        return result
    """

    # the train loop INDEPENDENT of forward.
    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log it.

        Args:
            batch: 8-item batch; the last item is ignored here.
            batch_idx: index of the batch (unused).

        Returns:
            A ``pl.TrainResult`` minimising the total loss, with
            ``train_loss`` logged per step and per epoch. The individual
            ciou/conf/cls components are computed but not logged.
        """
        (img, label_sbbox, label_mbbox, label_lbbox,
         sbboxes, mbboxes, lbboxes, _) = batch

        p, p_d = self(img)
        losses = self.criterion(p, p_d, label_sbbox, label_mbbox, label_lbbox,
                                sbboxes, mbboxes, lbboxes)
        loss, loss_ciou, loss_conf, loss_cls = losses

        result = pl.TrainResult(minimize=loss)
        result.log('train_loss', loss, on_step=True, on_epoch=True)
        return result

    def validation_epoch_end(self, outputs):
        """Turn the predictions stored during validation into an mAP value.

        Args:
            outputs: per-step return values of ``validation_step`` (unused;
                the predictions live in the evaluator's prediction file).

        Returns:
            A ``pl.EvalResult`` with ``val/mAP_epoch`` logged.
        """
        APs = self.evaluator.calc_APs()
        # Reset the prediction file so the next validation run starts empty.
        self.evaluator.clear_predict_file()

        # Sum of the per-key APs divided by the model's class count.
        mAP = sum(APs[name] for name in APs) / self.model.getNC()

        result = pl.EvalResult()
        # Place the metric on the module's own device instead of forcing
        # CUDA, so CPU-only runs and non-default GPUs work too.
        result.log('val/mAP_epoch', torch.Tensor([mAP]).to(self.device))
        return result

    def validation_step(self, batch, batch_idx):
        """Predict boxes for every image in the batch and store them.

        Only the first (images) and last (image names) of the 8 batch items
        are used. No validation loss is computed here.

        Returns:
            The constant ``1``.
        """
        img_batch, _, _, _, _, _, _, img_name = batch

        for name, image in tqdm(zip(img_name, img_batch)):
            # CHW -> HWC
            hwc_image = image.cpu().numpy().transpose(1, 2, 0)
            predicted = self.evaluator.get_bbox(hwc_image,
                                                multi_test=False,
                                                flip_test=False)
            self.evaluator.store_bbox(name, predicted)

        return 1

    def test_step(self, batch, batch_idx):
        """Compute the total loss for one test batch.

        ``training_step`` and ``validation_step`` receive 8-item batches
        (the extra item being the image names), so only the first seven
        items are taken here. 7-item batches keep working as before, and
        8-item batches no longer raise ``ValueError`` on unpacking.

        Args:
            batch: indexable batch whose first seven items are the image
                tensor followed by the six label/box tensors.
            batch_idx: index of the batch (unused).

        Returns:
            The total loss returned by the criterion.
        """
        (img, label_sbbox, label_mbbox, label_lbbox,
         sbboxes, mbboxes, lbboxes) = batch[:7]

        p, p_d = self(img)
        loss, loss_ciou, loss_conf, loss_cls = self.criterion(
            p, p_d, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes,
            lbboxes)
        return loss

    def configure_optimizers(self):
        """Return an SGD optimizer over the model parameters, configured from ``cfg.TRAIN``."""
        train_cfg = cfg.TRAIN
        return optim.SGD(
            self.model.parameters(),
            lr=train_cfg["LR_INIT"],
            momentum=train_cfg["MOMENTUM"],
            weight_decay=train_cfg["WEIGHT_DECAY"],
        )
Beispiel #2
0
class Trainer(object):
    def __init__(self, weight_path, resume, gpu_id, accumulate):
        """Build dataset, model, optimizer, loss and LR schedule for training.

        Args:
            weight_path: weight file path; its directory is also where
                checkpoints are read from and written to.
            resume: when true, restore state from ``last.pt`` instead of
                loading the weights at ``weight_path``.
            gpu_id: passed to ``gpu.select_device``.
            accumulate: number of batches between optimizer steps.
        """
        init_seeds(0)
        train_cfg = cfg.TRAIN

        self.device = gpu.select_device(gpu_id)
        self.start_epoch = 0
        self.best_mAP = 0.
        self.accumulate = accumulate
        self.epochs = train_cfg["EPOCHS"]
        self.weight_path = weight_path
        self.multi_scale_train = train_cfg["MULTI_SCALE_TRAIN"]

        self.train_dataset = data.Build_Dataset(
            anno_file_type="train", img_size=train_cfg["TRAIN_IMG_SIZE"])
        print('train img size is {}'.format(train_cfg["TRAIN_IMG_SIZE"]))
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=train_cfg["BATCH_SIZE"],
            num_workers=train_cfg["NUMBER_WORKERS"],
            shuffle=True,
            pin_memory=True)

        self.yolov4 = Build_Model().to(self.device)

        self.optimizer = optim.SGD(
            self.yolov4.parameters(),
            lr=train_cfg["LR_INIT"],
            momentum=train_cfg["MOMENTUM"],
            weight_decay=train_cfg["WEIGHT_DECAY"])

        self.criterion = YoloV4Loss(
            anchors=cfg.MODEL["ANCHORS"],
            strides=cfg.MODEL["STRIDES"],
            iou_threshold_loss=train_cfg["IOU_THRESHOLD_LOSS"])

        # Weights (and, on resume, optimizer state) are loaded before the
        # scheduler is created around the optimizer.
        self.__load_model_weights(weight_path, resume)

        # The schedule is expressed in iterations, not epochs.
        steps_per_epoch = len(self.train_dataloader)
        self.scheduler = cosine_lr_scheduler.CosineDecayLR(
            self.optimizer,
            T_max=self.epochs * steps_per_epoch,
            lr_init=train_cfg["LR_INIT"],
            lr_min=train_cfg["LR_END"],
            warmup=train_cfg["WARMUP_EPOCHS"] * steps_per_epoch)

    def __load_model_weights(self, weight_path, resume):
        if resume:
            last_weight = os.path.join(
                os.path.split(weight_path)[0], "last.pt")
            chkpt = torch.load(last_weight, map_location=self.device)
            self.yolov4.load_state_dict(chkpt['model'])

            self.start_epoch = chkpt['epoch'] + 1
            if chkpt['optimizer'] is not None:
                self.optimizer.load_state_dict(chkpt['optimizer'])
                self.best_mAP = chkpt['best_mAP']
            del chkpt
        else:
            self.yolov4.load_darknet_weights(weight_path)

    # def __save_model_weights(self, epoch, mAP):
    #     if mAP > self.best_mAP:
    #         self.best_mAP = mAP
    #     best_weight = os.path.join(os.path.split(self.weight_path)[0], "best.pt")
    #     last_weight = os.path.join(os.path.split(self.weight_path)[0], "last.pt")
    #     chkpt = {'epoch': epoch,
    #              'best_mAP': self.best_mAP,
    #              'model': self.yolov3.state_dict(),
    #              'optimizer': self.optimizer.state_dict()}
    #     torch.save(chkpt, last_weight)
    #
    #     if self.best_mAP == mAP:
    #         torch.save(chkpt['model'], best_weight)
    #
    #     if epoch > 0 and epoch % 10 == 0:
    #         torch.save(chkpt, os.path.join(os.path.split(self.weight_path)[0], 'backup_epoch%g.pt'%epoch))
    #     del chkpt

    def __save_model_weights(self, epoch, loss):
        weight = os.path.join(
            os.path.split(self.weight_path)[0],
            "epoch{}, loss{}.pt".format(epoch, loss))
        chkpt = {
            'epoch': epoch,
            'model': self.yolov4.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        torch.save(chkpt['model'], weight)

    def train(self):
        """Run the training loop from ``self.start_epoch`` to ``self.epochs``.

        Per epoch: train over the whole dataloader with apex AMP (``O1``)
        and gradient accumulation, then either save the weights (VOC) or
        evaluate with the COCO API and save the weights (COCO).

        Raises:
            ValueError: if ``cfg.TRAIN["DATA_TYPE"]`` is neither ``'VOC'``
                nor ``'COCO'``. Checked before training starts so that a
                misconfiguration does not cost a full epoch.
        """
        global writer  # module-level SummaryWriter instance

        data_type = cfg.TRAIN["DATA_TYPE"]
        if data_type not in ('VOC', 'COCO'):
            raise ValueError('dataset must be VOC or COCO')

        logger.info(
            "Training start,img size is: {:d},batchsize is: {:d},work number is {:d}"
            .format(cfg.TRAIN["TRAIN_IMG_SIZE"], cfg.TRAIN["BATCH_SIZE"],
                    cfg.TRAIN["NUMBER_WORKERS"]))
        logger.info(self.yolov4)
        logger.info("Train datasets number is : {}".format(
            len(self.train_dataset)))

        self.yolov4, self.optimizer = amp.initialize(self.yolov4,
                                                     self.optimizer,
                                                     opt_level='O1',
                                                     verbosity=0)
        steps_per_epoch = len(self.train_dataloader)
        logger.info("        =======  start  training   ======     ")
        for epoch in range(self.start_epoch, self.epochs):
            start = time.time()
            self.yolov4.train()

            mloss = torch.zeros(4)
            logger.info("===Epoch:[{}/{}]===".format(epoch, self.epochs))
            for i, (imgs, label_sbbox, label_mbbox, label_lbbox, sbboxes,
                    mbboxes, lbboxes) in enumerate(self.train_dataloader):

                # Iteration counter across epochs; drives both the LR
                # schedule and the TensorBoard x-axis.
                global_step = steps_per_epoch * epoch + i
                self.scheduler.step(global_step)

                imgs = imgs.to(self.device)
                label_sbbox = label_sbbox.to(self.device)
                label_mbbox = label_mbbox.to(self.device)
                label_lbbox = label_lbbox.to(self.device)
                sbboxes = sbboxes.to(self.device)
                mbboxes = mbboxes.to(self.device)
                lbboxes = lbboxes.to(self.device)

                p, p_d = self.yolov4(imgs)

                loss, loss_giou, loss_conf, loss_cls = self.criterion(
                    p, p_d, label_sbbox, label_mbbox, label_lbbox, sbboxes,
                    mbboxes, lbboxes)

                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()

                # Step once `accumulate` batches have been accumulated
                # (not already after the very first batch), and flush at
                # the end of the epoch so no partial gradient leaks into
                # the next one.
                if ((i + 1) % self.accumulate == 0
                        or (i + 1) == steps_per_epoch):
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                # Update running mean of tracked metrics
                loss_items = torch.tensor(
                    [loss_giou, loss_conf, loss_cls, loss])
                mloss = (mloss * i + loss_items) / (i + 1)

                # Print batch results
                if i % 10 == 0:
                    logger.info(
                        "  === Epoch:[{:3}/{}],step:[{:3}/{}],img_size:[{:3}],total_loss:{:.4f}|loss_giou:{:.4f}|loss_conf:{:.4f}|loss_cls:{:.4f}|lr:{:.4f}"
                        .format(epoch, self.epochs, i,
                                steps_per_epoch - 1,
                                self.train_dataset.img_size, mloss[3],
                                mloss[0], mloss[1], mloss[2],
                                self.optimizer.param_groups[0]['lr']))
                    # Use the global step: a per-epoch index would restart
                    # at 0 every epoch and overlay the curves.
                    writer.add_scalar('train_loss', mloss[3], global_step)

                # Multi-scale training: pick a new size from 320..608
                # (multiples of 32) every 10 batches.
                if self.multi_scale_train and (i + 1) % 10 == 0:
                    self.train_dataset.img_size = random.choice(
                        range(10, 20)) * 32

            if data_type == 'VOC':
                # No validation is run for VOC here; the running mean of
                # the total loss labels the saved weights.
                self.__save_model_weights(epoch, mloss[3])
                print('save weights done')
            else:  # 'COCO' (validated above)
                evaluator = COCOAPIEvaluator(model_type='YOLOv4',
                                             data_dir=cfg.DATA_PATH,
                                             img_size=cfg.VAL["TEST_IMG_SIZE"],
                                             confthre=0.08,
                                             nmsthre=cfg.VAL["NMS_THRESH"])
                ap50_95, ap50 = evaluator.evaluate(self.yolov4)
                writer.add_scalar('val/COCOAP50', ap50, epoch)
                writer.add_scalar('val/COCOAP50_95', ap50_95, epoch)
                self.__save_model_weights(epoch, ap50)

            end = time.time()
            logger.info("  ===cost time:{:.4f}s".format(end - start))
        logger.info(
            "=====Training Finished.   best_test_mAP:{:.3f}%====".format(
                self.best_mAP))
Beispiel #3
0
class Trainer(object):
    def __init__(self,
                 weight_path=None,
                 resume=False,
                 gpu_id=0,
                 accumulate=1,
                 fp_16=False):
        """Build dataset, model, optimizer, loss and LR schedule for training.

        Args:
            weight_path: forwarded to ``Build_Model``; its directory is where
                checkpoints are read from and written to.
            resume: forwarded to ``Build_Model``; when true the state in
                ``last.pt`` is restored as the final step.
            gpu_id: passed to ``gpu.select_device``.
            accumulate: number of batches between optimizer steps.
            fp_16: enable apex AMP during ``train``.
        """
        init_seeds(0)
        train_cfg = cfg.TRAIN

        self.fp_16 = fp_16
        self.device = gpu.select_device(gpu_id)
        self.start_epoch = 0
        self.best_mAP = 0.0
        self.accumulate = accumulate
        self.weight_path = weight_path
        self.multi_scale_train = train_cfg["MULTI_SCALE_TRAIN"]
        self.showatt = train_cfg["showatt"]

        if not self.multi_scale_train:
            print("train img size is {}".format(train_cfg["TRAIN_IMG_SIZE"]))
        else:
            print("Using multi scales training")

        self.train_dataset = data.Build_Dataset(
            anno_file_type="train", img_size=train_cfg["TRAIN_IMG_SIZE"])

        # Epoch budget and the first epoch with evaluation depend on the
        # configured model type.
        if cfg.MODEL_TYPE["TYPE"] == "YOLOv4":
            self.epochs = train_cfg["YOLO_EPOCHS"]
        else:
            self.epochs = train_cfg["Mobilenet_YOLO_EPOCHS"]
        if cfg.MODEL_TYPE["TYPE"] == "YOLOv4":
            self.eval_epoch = 30
        else:
            self.eval_epoch = 50

        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=train_cfg["BATCH_SIZE"],
            num_workers=train_cfg["NUMBER_WORKERS"],
            shuffle=True,
            pin_memory=True,
        )

        model = Build_Model(weight_path=weight_path,
                            resume=resume,
                            showatt=self.showatt)
        self.yolov4 = model.to(self.device)

        self.optimizer = optim.SGD(
            self.yolov4.parameters(),
            lr=train_cfg["LR_INIT"],
            momentum=train_cfg["MOMENTUM"],
            weight_decay=train_cfg["WEIGHT_DECAY"],
        )

        self.criterion = YoloV4Loss(
            anchors=cfg.MODEL["ANCHORS"],
            strides=cfg.MODEL["STRIDES"],
            iou_threshold_loss=train_cfg["IOU_THRESHOLD_LOSS"],
        )

        # The schedule is expressed in iterations, not epochs.
        steps_per_epoch = len(self.train_dataloader)
        self.scheduler = cosine_lr_scheduler.CosineDecayLR(
            self.optimizer,
            T_max=self.epochs * steps_per_epoch,
            lr_init=train_cfg["LR_INIT"],
            lr_min=train_cfg["LR_END"],
            warmup=train_cfg["WARMUP_EPOCHS"] * steps_per_epoch,
        )

        if resume:
            self.__load_resume_weights(weight_path)

    def __load_resume_weights(self, weight_path):

        last_weight = os.path.join(os.path.split(weight_path)[0], "last.pt")
        chkpt = torch.load(last_weight, map_location=self.device)
        self.yolov4.load_state_dict(chkpt["model"])

        self.start_epoch = chkpt["epoch"] + 1
        if chkpt["optimizer"] is not None:
            self.optimizer.load_state_dict(chkpt["optimizer"])
            self.best_mAP = chkpt["best_mAP"]
        del chkpt

    def __save_model_weights(self, epoch, mAP):
        if mAP > self.best_mAP:
            self.best_mAP = mAP
        best_weight = os.path.join(
            os.path.split(self.weight_path)[0], "best.pt")
        last_weight = os.path.join(
            os.path.split(self.weight_path)[0], "last.pt")
        chkpt = {
            "epoch": epoch,
            "best_mAP": self.best_mAP,
            "model": self.yolov4.state_dict(),
            "optimizer": self.optimizer.state_dict(),
        }
        torch.save(chkpt, last_weight)

        if self.best_mAP == mAP:
            torch.save(chkpt["model"], best_weight)

        if epoch > 0 and epoch % 10 == 0:
            torch.save(
                chkpt,
                os.path.join(
                    os.path.split(self.weight_path)[0],
                    "backup_epoch%g.pt" % epoch,
                ),
            )
        del chkpt

    def train(self):
        """Run the training loop from ``self.start_epoch`` to ``self.epochs``.

        Per epoch: train over the whole dataloader (optionally with apex
        AMP) using gradient accumulation, then evaluate and checkpoint.
        For ``VOC``/``Customer`` data, evaluation and saving only happen
        once ``epoch >= self.eval_epoch``; for ``COCO`` they happen every
        epoch.
        """
        global writer
        logger.info(
            "Training start,img size is: {:d},batchsize is: {:d},work number is {:d}"
            .format(
                cfg.TRAIN["TRAIN_IMG_SIZE"],
                cfg.TRAIN["BATCH_SIZE"],
                cfg.TRAIN["NUMBER_WORKERS"],
            ))
        logger.info(self.yolov4)
        logger.info("Train datasets number is : {}".format(
            len(self.train_dataset)))

        def is_valid_number(x):
            # Reject NaN/inf and exploding losses so they are not
            # back-propagated.
            return not (math.isnan(x) or math.isinf(x) or x > 1e4)

        if self.fp_16:
            self.yolov4, self.optimizer = amp.initialize(self.yolov4,
                                                         self.optimizer,
                                                         opt_level="O1",
                                                         verbosity=0)
        steps_per_epoch = len(self.train_dataloader)
        logger.info("        =======  start  training   ======     ")
        for epoch in range(self.start_epoch, self.epochs):
            start = time.time()
            self.yolov4.train()

            mloss = torch.zeros(4)
            logger.info("===Epoch:[{}/{}]===".format(epoch, self.epochs))
            for i, (
                    imgs,
                    label_sbbox,
                    label_mbbox,
                    label_lbbox,
                    sbboxes,
                    mbboxes,
                    lbboxes,
            ) in enumerate(self.train_dataloader):
                # Iteration counter across epochs. len(dataloader) already
                # counts batches, so it must not be divided by the batch
                # size again: the scheduler's T_max and warmup are
                # expressed in exactly these units.
                global_step = steps_per_epoch * epoch + i
                self.scheduler.step(global_step)

                imgs = imgs.to(self.device)
                label_sbbox = label_sbbox.to(self.device)
                label_mbbox = label_mbbox.to(self.device)
                label_lbbox = label_lbbox.to(self.device)
                sbboxes = sbboxes.to(self.device)
                mbboxes = mbboxes.to(self.device)
                lbboxes = lbboxes.to(self.device)

                p, p_d = self.yolov4(imgs)

                loss, loss_ciou, loss_conf, loss_cls = self.criterion(
                    p,
                    p_d,
                    label_sbbox,
                    label_mbbox,
                    label_lbbox,
                    sbboxes,
                    mbboxes,
                    lbboxes,
                )
                if is_valid_number(loss.item()):
                    if self.fp_16:
                        with amp.scale_loss(loss,
                                            self.optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                # Step once `accumulate` batches have been accumulated
                # (not already after the very first batch), and flush at
                # the end of the epoch so no partial gradient leaks into
                # the next one.
                if ((i + 1) % self.accumulate == 0
                        or (i + 1) == steps_per_epoch):
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                # Update running mean of tracked metrics
                loss_items = torch.tensor(
                    [loss_ciou, loss_conf, loss_cls, loss])
                mloss = (mloss * i + loss_items) / (i + 1)

                # Print batch results
                if i % 10 == 0:
                    logger.info(
                        "  === Epoch:[{:3}/{}],step:[{:3}/{}],img_size:[{:3}],total_loss:{:.4f}|loss_ciou:{:.4f}|loss_conf:{:.4f}|loss_cls:{:.4f}|lr:{:.4f}"
                        .format(
                            epoch,
                            self.epochs,
                            i,
                            steps_per_epoch - 1,
                            self.train_dataset.img_size,
                            mloss[3],
                            mloss[0],
                            mloss[1],
                            mloss[2],
                            self.optimizer.param_groups[0]["lr"],
                        ))
                    for tag, value in (
                            ("loss_ciou", mloss[0]),
                            ("loss_conf", mloss[1]),
                            ("loss_cls", mloss[2]),
                            ("train_loss", mloss[3]),
                    ):
                        writer.add_scalar(tag, value, global_step)

                # Multi-scale training: pick a new size from 320..608
                # (multiples of 32) every 10 batches.
                if self.multi_scale_train and (i + 1) % 10 == 0:
                    self.train_dataset.img_size = (
                        random.choice(range(10, 20)) * 32)

            data_type = cfg.TRAIN["DATA_TYPE"]
            if data_type == "VOC" or data_type == "Customer":
                mAP = 0.0
                if epoch >= self.eval_epoch:
                    logger.info("===== Validate =====")
                    logger.info("val img size is {}".format(
                        cfg.VAL["TEST_IMG_SIZE"]))
                    with torch.no_grad():
                        APs, inference_time = Evaluator(
                            self.yolov4, showatt=self.showatt).APs_voc()
                        for class_name in APs:
                            logger.info("{} --> mAP : {}".format(
                                class_name, APs[class_name]))
                            mAP += APs[class_name]
                        mAP = mAP / self.train_dataset.num_classes
                        logger.info("mAP : {}".format(mAP))
                        logger.info(
                            "inference time: {:.2f} ms".format(inference_time))
                        writer.add_scalar("mAP", mAP, epoch)
                        self.__save_model_weights(epoch, mAP)
                        logger.info("save weights done")
                    logger.info("  ===test mAP:{:.3f}".format(mAP))
            elif data_type == "COCO":
                evaluator = COCOAPIEvaluator(
                    model_type="YOLOv4",
                    data_dir=cfg.DATA_PATH,
                    img_size=cfg.VAL["TEST_IMG_SIZE"],
                    confthre=0.08,
                    nmsthre=cfg.VAL["NMS_THRESH"],
                )
                ap50_95, ap50 = evaluator.evaluate(self.yolov4)
                logger.info("ap50_95:{}|ap50:{}".format(ap50_95, ap50))
                writer.add_scalar("val/COCOAP50", ap50, epoch)
                writer.add_scalar("val/COCOAP50_95", ap50_95, epoch)
                self.__save_model_weights(epoch, ap50)
                print("save weights done")
            end = time.time()
            logger.info("  ===cost time:{:.4f}s".format(end - start))
        logger.info(
            "=====Training Finished.   best_test_mAP:{:.3f}%====".format(
                self.best_mAP))
Beispiel #4
0
class Trainer(object):
    _resume = False
    _fine_tune = False
    def __init__(self, log_dir, resume=False, fine_tune=False):
        """Build dataset, model, optimizer, loss and LR schedule for training.

        Args:
            log_dir: stored as ``self.log_dir``.
            resume: forwarded to ``Build_Model``; when true, the resume
                checkpoint is loaded after everything has been built.
            fine_tune: when true, the ``cfg.FINE_TUNE`` overrides are applied
                to ``cfg.TRAIN`` first and the best weights are loaded last.
        """
        init_seeds(0)
        self._fine_tune = fine_tune
        self._resume = resume
        # Must happen before any cfg.TRAIN value below is read.
        if self._fine_tune:
            self.__prepare_fine_tune()

        train_cfg = cfg.TRAIN
        self.device = gpu.select_device()
        self.start_epoch = 0
        self.best_mAP = 0.
        self.accumulate = train_cfg.ACCUMULATE
        self.log_dir = log_dir
        self.weight_path = "yolov4.weights"
        self.multi_scale_train = train_cfg.MULTI_SCALE_TRAIN
        if not self.multi_scale_train:
            print('train img size is {}'.format(train_cfg.TRAIN_IMG_SIZE))
        else:
            print('Using multi scales training')

        self.train_dataset = data.Build_Train_Dataset(
            anno_file=train_cfg.ANNO_FILE,
            anno_file_type="train",
            img_size=train_cfg.TRAIN_IMG_SIZE)

        if cfg.MODEL.MODEL_TYPE == 'YOLOv4':
            self.epochs = train_cfg.YOLO_EPOCHS
        else:
            self.epochs = train_cfg.Mobilenet_YOLO_EPOCHS

        # The loader yields BATCH_SIZE // ACCUMULATE samples per batch.
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=train_cfg.BATCH_SIZE // train_cfg.ACCUMULATE,
            num_workers=train_cfg.NUMBER_WORKERS,
            shuffle=True,
            pin_memory=True)

        self.yolov4 = Build_Model(weight_path="yolov4.weights",
                                  resume=resume).to(self.device)

        self.optimizer = optim.SGD(
            self.yolov4.parameters(),
            lr=train_cfg.LR_INIT,
            momentum=train_cfg.MOMENTUM,
            weight_decay=train_cfg.WEIGHT_DECAY)

        self.criterion = YoloV4Loss(
            anchors=cfg.MODEL.ANCHORS,
            strides=cfg.MODEL.STRIDES,
            iou_threshold_loss=train_cfg.IOU_THRESHOLD_LOSS)

        # The schedule is expressed in iterations, not epochs.
        steps_per_epoch = len(self.train_dataloader)
        self.scheduler = cosine_lr_scheduler.CosineDecayLR(
            self.optimizer,
            T_max=self.epochs * steps_per_epoch,
            lr_init=train_cfg.LR_INIT,
            lr_min=train_cfg.LR_END,
            warmup=train_cfg.WARMUP_EPOCHS * steps_per_epoch)

        if resume:
            self.__load_resume_weights()
        if self._fine_tune:
            self.__load_best_weights()
        
    def __prepare_fine_tune(self):
        """Copy the fine-tune settings from ``cfg.FINE_TUNE`` over ``cfg.TRAIN``.

        The config is defrosted for the duration of the update and frozen
        again afterwards.
        """
        overridden_keys = (
            "ANNO_FILE",
            "YOLO_EPOCHS",
            "LR_INIT",
            "LR_END",
            "WARMUP_EPOCHS",
        )
        cfg.defrost()
        for key in overridden_keys:
            setattr(cfg.TRAIN, key, getattr(cfg.FINE_TUNE, key))
        cfg.freeze()
        
    def __load_best_weights(self):
        best_weight = os.path.join(log_dir,"checkpoints", "best.pt")
        last_weight = os.path.join(log_dir,"checkpoints", "last.pt")    
        shutil.copy2(best_weight,
                     best_weight.replace("best.pt","best_before_fine_tune.pt"))
        shutil.copy2(last_weight,
                     last_weight.replace("last.pt","last_before_fine_tune.pt"))
        last_chkpt = torch.load(last_weight, map_location=self.device)
        best_chkpt = torch.load(best_weight, map_location=self.device)
        self.yolov4.load_state_dict(best_chkpt)
        self.best_mAP = 0 
        del last_chkpt, best_chkpt

        
    def __load_resume_weights(self):

        last_weight = os.path.join(log_dir,"checkpoints", "last.pt")
        chkpt = torch.load(last_weight, map_location=self.device)
        self.yolov4.load_state_dict(chkpt['model'])

        self.start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            self.optimizer.load_state_dict(chkpt['optimizer'])
            self.best_mAP = chkpt['best_mAP']
        del chkpt

    def __save_model_weights(self, epoch, mAP):
        if mAP > self.best_mAP:
            self.best_mAP = mAP
        best_weight = os.path.join(log_dir,"checkpoints", "best.pt")
        last_weight = os.path.join(log_dir,"checkpoints", "last.pt")
        chkpt = {'epoch': epoch,
                 'best_mAP': self.best_mAP,
                 'model': self.yolov4.module.state_dict() if torch.cuda.device_count()>1 else self.yolov4.state_dict(),
                 'optimizer': self.optimizer.state_dict()}
        torch.save(chkpt, last_weight)

        if self.best_mAP == mAP:
            torch.save(chkpt['model'], best_weight)

        if self._fine_tune and epoch % 5 == 0:
            torch.save(chkpt['model'], os.path.join(log_dir,"checkpoints", 'backup_fine_tune_epoch_{:02d}.pt'.format(epoch)))
        del chkpt

    def train(self):
        """Run the training loop from ``self.start_epoch`` to ``self.epochs``.

        Per epoch: train over the whole dataloader with gradient
        accumulation, evaluate with the COCO API, log the twelve COCO
        statistics to the logger and TensorBoard, and checkpoint using
        ``coco_stat[0]`` as the score.
        """
        global writer
        logger.info("Training start,img size is: {:d},batchsize is: {:d}, subdivision: {:d}, worker number is {:d}".format(cfg.TRAIN.TRAIN_IMG_SIZE, cfg.TRAIN.BATCH_SIZE, cfg.TRAIN.ACCUMULATE, cfg.TRAIN.NUMBER_WORKERS))
        logger.info(self.yolov4)
        n_train = len(self.train_dataset)
        # Take the batch count from the loader itself. The scheduler's
        # T_max is built from len(dataloader), and `n // bs + 1` is one too
        # many whenever the dataset size is a multiple of the batch size.
        n_step = len(self.train_dataloader)
        logger.info("Train datasets number is : {}".format(n_train))
        evaluator = COCOAPIEvaluator(cfg=cfg,
                                     img_size=cfg.VAL.TEST_IMG_SIZE,
                                     confthre=cfg.VAL.CONF_THRESH,
                                     nmsthre=cfg.VAL.NMS_THRESH)

        # (log template, TensorBoard tag), in the order of coco_stat.
        coco_stat_report = (
            ("Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = {:.04f}", 'val/mAP_50_95'),
            ("Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = {:.04f}", 'val/mAP_50'),
            ("Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = {:.04f}", 'val/mAP_75'),
            ("Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = {:.04f}", 'val/mAP_small'),
            ("Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = {:.04f}", 'val/mAP_medium'),
            ("Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = {:.04f}", 'val/mAP_large'),
            ("Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = {:.04f}", 'val/mAR_max_1'),
            ("Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = {:.04f}", 'val/mAR_max_10'),
            ("Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = {:.04f}", 'val/mAR_max_100'),
            ("Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = {:.04f}", 'val/mAR_small'),
            ("Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = {:.04f}", 'val/mAR_medium'),
            ("Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = {:.04f}", 'val/mAR_large'),
        )

        if torch.cuda.device_count() > 1:
            self.yolov4 = torch.nn.DataParallel(self.yolov4)
        logger.info("\n===============  start  training   ===============")
        for epoch in range(self.start_epoch, self.epochs):
            start = time.time()
            self.yolov4.train()
            with tqdm(total=n_train, unit="imgs", desc=f'Epoch {epoch}/{self.epochs}', ncols=30) as pbar:
                for i, (imgs, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes) in enumerate(self.train_dataloader):
                    # Iteration counter across epochs.
                    global_step = n_step * epoch + i

                    imgs = imgs.to(self.device)
                    label_sbbox = label_sbbox.to(self.device)
                    label_mbbox = label_mbbox.to(self.device)
                    label_lbbox = label_lbbox.to(self.device)
                    sbboxes = sbboxes.to(self.device)
                    mbboxes = mbboxes.to(self.device)
                    lbboxes = lbboxes.to(self.device)

                    p, p_d = self.yolov4(imgs)

                    loss, loss_ciou, loss_conf, loss_cls = self.criterion(p, p_d, label_sbbox, label_mbbox,
                                                    label_lbbox, sbboxes, mbboxes, lbboxes)

                    loss.backward()
                    # Step once `accumulate` batches have been accumulated
                    # (not already after the very first batch), and flush
                    # at the end of the epoch so no partial gradient leaks
                    # into the next one.
                    if (i + 1) % self.accumulate == 0 or (i + 1) == n_step:
                        self.scheduler.step(global_step)
                        self.optimizer.step()
                        self.optimizer.zero_grad()

                    # Print batch results
                    if i % (5*self.accumulate) == 0:
                        current_lr = self.optimizer.param_groups[0]['lr']
                        logger.info("{:3}: total_loss:{:.4f} | loss_ciou:{:.4f} | loss_conf:{:.4f} | loss_cls:{:.4f} | lr:{:.6f}".format(
                            self.train_dataset.img_size, loss, loss_ciou, loss_conf, loss_cls, current_lr
                        ))
                        writer.add_scalar('train/loss_ciou', loss_ciou, global_step)
                        writer.add_scalar('train/loss_conf', loss_conf, global_step)
                        writer.add_scalar('train/loss_cls', loss_cls, global_step)
                        writer.add_scalar('train/train_loss', loss, global_step)
                        writer.add_scalar('train/lr', current_lr, global_step)
                    # Multi-scale training: pick a new size from 320..608
                    # (multiples of 32) every 5 accumulated steps.
                    if self.multi_scale_train and (i+1) % (5*self.accumulate) == 0:
                        self.train_dataset.img_size = random.choice(range(10, 20)) * 32
                    pbar.update(imgs.shape[0])

            coco_stat = evaluator.evaluate(self.yolov4)
            for index, (template, _) in enumerate(coco_stat_report):
                logger.info(template.format(coco_stat[index]))
            for index, (_, tag) in enumerate(coco_stat_report):
                writer.add_scalar(tag, coco_stat[index], epoch)

            self.__save_model_weights(epoch, coco_stat[0])
            logger.info('save weights done')

            end = time.time()
            logger.info("cost time:{:.4f}s".format(end - start))
        logger.info("=====Training Finished.   best_test_mAP:{:.3f}%====".format(self.best_mAP))
Beispiel #5
0
# Training dataset built from the annotation file at `train_anno_path`.
train_dataset = BuildDataset(train_anno_path)

# Shuffled loader over the training set; num_workers=0 loads batches in the
# main process.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=0,
    shuffle=True,
    pin_memory=True,
)

# Model, built from `weight_path` and moved to `device`.
yolov4 = Build_Model(weight_path=weight_path).to(device)

# SGD over all model parameters, configured from cfg.TRAIN.
optimizer = optim.SGD(
    yolov4.parameters(),
    lr=cfg.TRAIN["LR_INIT"],
    momentum=cfg.TRAIN["MOMENTUM"],
    weight_decay=cfg.TRAIN["WEIGHT_DECAY"],
)

# Loss function, configured with the model's anchors and strides and the
# training IoU threshold.
criterion = YoloV4Loss(
    anchors=cfg.MODEL["ANCHORS"],
    strides=cfg.MODEL["STRIDES"],
    iou_threshold_loss=cfg.TRAIN["IOU_THRESHOLD_LOSS"],
)

scheduler = cosine_lr_scheduler.CosineDecayLR(
    optimizer,
    T_max=epochs * len(train_dataloader),
    lr_init=cfg.TRAIN["LR_INIT"],