Ejemplo n.º 1
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP 가 모든 연산을 지원하지는 않는다.
    modulated convolution을 지원하지 않음
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # 고정
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)

    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0001,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0001,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            topk=topk,
                            scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()
        '''
        target generator를 train_dataloader에서 만들어 버리는게 학습 속도가 훨씬 빠르다. 
        '''

        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image,
                                      num_outputs=subdivision,
                                      axis=0)
            heatmap_split = mx.nd.split(data=heatmap,
                                        num_outputs=subdivision,
                                        axis=0)
            offset_target_split = mx.nd.split(data=offset_target,
                                              num_outputs=subdivision,
                                              axis=0)
            wh_target_split = mx.nd.split(data=wh_target,
                                          num_outputs=subdivision,
                                          axis=0)
            mask_target_split = mx.nd.split(data=mask_target,
                                            num_outputs=subdivision,
                                            axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(
                            image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(
                            image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(
                            heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기

            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # loss 구하기
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, ctx, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)

                    id, score, bbox = prediction(heatmap_pred, offset_pred,
                                                 wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred,
                                                    heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred,
                                                     wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                            bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # heatmap 그리기
                        heatmap = mx.nd.multiply(heatmap,
                                                 255.0)  # 0 ~ 255 범위로 바꾸기
                        heatmap = mx.nd.max(
                            heatmap, axis=0,
                            keepdims=True)  # channel 축으로 가장 큰것 뽑기
                        heatmap = mx.nd.transpose(
                            heatmap,
                            axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(
                            heatmap, repeats=3,
                            axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy(
                        )  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap,
                                             dsize=(input_size[1],
                                                    input_size[0]))  # 사이즈 원복
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :,
                                (0, 1, 2)] = heatmap[:, :,
                                                     (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(
                            heatmap,
                            axes=(2, 0, 1))  # (channel=3, height, width)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate(
                    [np.array(batch_image),
                     np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result",
                                  image=all_image,
                                  global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean":
                                       train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean":
                                       valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean":
                                       train_offset_loss_mean,
                                       "valid_offset_loss_mean":
                                       valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean":
                                       train_wh_loss_mean,
                                       "valid_wh_loss_mean": valid_wh_loss_mean
                                   },
                                   global_step=i)

                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # 새로운 객체가 생성
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Ejemplo n.º 2
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001, decay_lr=0.999, decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 일그러지는 것을 막기 위함
    if input_size[0] % 32 != 0 and input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)

    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        '''
        mxnet c++에서 arbitrary input image 를 받기 위한 전략
        alloc_size : tuple of int, default is (128, 128)
        For advanced users. Define `alloc_size` to generate large enough offset
        maps, which will later saved in parameters. During inference, we support arbitrary
        input image by cropping corresponding area of the anchor map. This allow us
        to export to symbol so we can run it in c++, Scalar, etc.
        '''
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground만
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False})

        else:
            logging.error("optimizer not selected")
            exit(0)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0)
            wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0)
            objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0)
            class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0)
            weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                xcyc_all = [xcyc_all]
                wh_all = [wh_all]
                objectness_all = [objectness_all]
                class_all = [class_all]
                weights_all = [weights_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image,
                                                                                                           xcyc_all,
                                                                                                           wh_all,
                                                                                                           objectness_all,
                                                                                                           class_all,
                                                                                                           weights_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False)

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split,
                                                                                              wh_split,
                                                                                              objectness_split,
                                                                                              class_split,
                                                                                              weights_split):
                        output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                            img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                           wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)
                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean
        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # loss 구하기
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size, _, height, width = image.shape

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False)

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []
                total_loss = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all,
                                                                                              wh_all, objectness_all,
                                                                                              class_all, weights_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2,
                                                 offset3, stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                       wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())
                    total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses)

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1,
                                                     offset2, offset3, stride1, stride2, stride3)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes, absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes, absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)

                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean}, global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean}, global_step=i)

                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Ejemplo n.º 3
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
        box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +
        [[1, 2, 0.5]] * 2,
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[400, 600],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=10,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base="VGG16_512",
        pretrained_base=True,
        pretrained_path="modelparam",
        classHardNegativeMining=True,
        boxHardNegativeMining=True,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        decode_number=-1,
        multiperclass=True,
        nms_thresh=0.45,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.01,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training SSD Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        if base.upper() == "VGG16_300":  # 입력 사이즈 300 x 300 추천
            net = SSD_VGG16(version=300,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            alloc_size=anchor_alloc_size,
                            ctx=mx.cpu())
        elif base.upper() == "VGG16_512":  # 입력 사이즈 512 x 512 추천
            net = SSD_VGG16(version=512,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    # 이름 다시 붙이기
    optimizer = optimizer.upper()
    base = base.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + base
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_" + base

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        if base.upper() == "VGG16_300":  # 입력 사이즈 300 x 300 추천
            net = SSD_VGG16(
                version=300,
                input_size=input_size,
                # box_sizes=[21, 45, 101.25, 157.5, 213.75, 270, 326.25],
                # box_ratios=[[1, 2, 0.5]] +  # conv4_3
                #            [[1, 2, 0.5, 3, 1.0 / 3]] * 3 +  # conv7, conv8_2, conv9_2, conv10_2
                #            [[1, 2, 0.5]] * 2,  # conv11_2, conv12_2
                box_sizes=box_sizes,
                box_ratios=box_ratios,
                num_classes=num_classes,
                pretrained=pretrained_base,
                pretrained_path=pretrained_path,
                anchor_box_clip=anchor_box_clip,
                alloc_size=anchor_alloc_size,
                ctx=ctx)

        elif base.upper() == "VGG16_512":  # 입력 사이즈 512 x 512 추천
            net = SSD_VGG16(
                version=512,
                input_size=input_size,
                # box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
                # box_ratios=[[1, 2, 0.5]] +  # conv4_3
                #            [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +  # conv7, conv8_2, conv9_2, conv10_2
                #            [[1, 2, 0.5]] * 2,  # conv11_2, conv12_2
                box_sizes=box_sizes,
                box_ratios=box_ratios,
                num_classes=num_classes,
                pretrained=pretrained_base,
                pretrained_path=pretrained_path,
                anchor_box_clip=anchor_box_clip,
                ctx=ctx)
        else:
            logging.warning("backbone 없음")
            exit(0)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0005,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0005,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)
    '''
    localization loss -> Smooth L1 loss
    confidence loss -> Softmax
    '''
    if not classHardNegativeMining:
        confidence_loss = SoftmaxCrossEntropyLoss(axis=-1,
                                                  sparse_label=True,
                                                  from_log_softmax=False,
                                                  batch_axis=None,
                                                  reduction="sum",
                                                  exclude=False)
    if not boxHardNegativeMining:
        localization_loss = HuberLoss(rho=1,
                                      batch_axis=None,
                                      reduction="sum",
                                      exclude=False)

    prediction = Prediction(from_softmax=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    start_time = time.time()

    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_t_all = [cls_t_all]
                box_t_all = [box_t_all]

            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(
                            image_split, [ctx], even_split=False)
                        cls_split = gluon.utils.split_and_load(
                            cls_split, [ctx], even_split=False)
                        box_split = gluon.utils.split_and_load(
                            box_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(
                            image_split, ctx, even_split=False)
                        cls_split = gluon.utils.split_and_load(
                            cls_split, ctx, even_split=False)
                        box_split = gluon.utils.split_and_load(
                            box_split, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        # 1. SSD network Inference
                        cls_pred, box_pred, anchor = net(img)
                        '''
                            4. Hard negative mining (class에만 loss 계산)
                            Hard negative mining After the matching step, most of the default boxes are negatives,
                            especially when the number of possible default boxes is large. This introduces a
                            significant imbalance between the positive and negative training examples. Instead of
                            using all the negative examples, we sort them using the highest confidence loss for each
                            default box and pick the top ones so that the ratio between the negatives and positives is
                            at most 3:1. We found that this leads to faster optimization and a more stable training
                        '''
                        weight_term_alpha = 1
                        negative_mining_ratio = 3
                        positive_samples = cls_target > 0  # True or False
                        positive_numbers = positive_samples.sum()
                        if classHardNegativeMining:
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss = -mx.nd.pick(
                                pred, cls_target,
                                axis=-1)  # (batch, all feature number)
                            '''
                            we sort them using the highest confidence loss for each
                            default box and pick the top ones so that the ratio between the negatives and positives is
                            at most 3:1.
                            '''
                            negative_samples_conf_loss = (conf_loss *
                                                          negative_samples)
                            # 아래 3줄의 코드 출처 : from gluoncv.loss import SSDMultiBoxLoss
                            negative_samples_index = mx.nd.argsort(
                                negative_samples_conf_loss,
                                axis=-1,
                                is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index,
                                                      axis=-1,
                                                      is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers,
                                negative_mining_ratio).expand_dims(-1)
                            pos_hardnega = positive_samples + hard_negative_samples
                            conf_loss = mx.nd.where(
                                pos_hardnega > 0, conf_loss,
                                mx.nd.zeros_like(conf_loss))
                            conf_loss = mx.nd.sum(conf_loss)
                            if positive_numbers:
                                conf_loss = mx.nd.divide(
                                    conf_loss, positive_numbers)
                            else:
                                conf_loss = mx.nd.multiply(conf_loss, 0)
                            cls_losses.append(conf_loss.asscalar())
                        else:
                            conf_loss = confidence_loss(
                                cls_pred, cls_target,
                                positive_samples.expand_dims(axis=-1))
                            if positive_numbers:
                                conf_loss = mx.nd.divide(
                                    conf_loss, positive_numbers)
                            else:
                                conf_loss = mx.nd.multiply(conf_loss, 0)
                            cls_losses.append(conf_loss.asscalar())

                        if boxHardNegativeMining:
                            # loc loss에도 hard HardNegativeMining 적용해보자.
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss_for_box = -mx.nd.pick(
                                pred, cls_target,
                                axis=-1)  # (batch, all feature number)
                            negative_samples_conf_loss = (conf_loss_for_box *
                                                          negative_samples)
                            negative_samples_index = mx.nd.argsort(
                                negative_samples_conf_loss,
                                axis=-1,
                                is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index,
                                                      axis=-1,
                                                      is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers,
                                negative_mining_ratio).expand_dims(-1)
                            pos_hardnega = positive_samples + hard_negative_samples
                            pos_hardnega = mx.nd.repeat(
                                pos_hardnega.reshape(shape=(0, 0, 1)),
                                repeats=4,
                                axis=-1)

                            loc_loss = mx.nd.abs(box_pred - box_target)
                            loc_loss = mx.nd.where(loc_loss > 1,
                                                   loc_loss - 0.5, (0.5 / 1) *
                                                   mx.nd.square(loc_loss))
                            loc_loss = mx.nd.where(pos_hardnega > 0, loc_loss,
                                                   mx.nd.zeros_like(loc_loss))
                            loc_loss = mx.nd.sum(loc_loss)
                            if positive_numbers:
                                loc_loss = mx.nd.divide(
                                    loc_loss, positive_numbers)
                            else:
                                loc_loss = mx.nd.multiply(loc_loss, 0)
                            box_losses.append(loc_loss.asscalar())
                        else:
                            loc_loss = localization_loss(
                                box_pred, box_target,
                                positive_samples.expand_dims(axis=-1))
                            if positive_numbers:
                                loc_loss = mx.nd.divide(
                                    loc_loss, positive_numbers)
                            else:
                                loc_loss = mx.nd.multiply(loc_loss, 0)
                            box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss +
                                          weight_term_alpha * loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            if classHardNegativeMining:
                confidence_loss = SoftmaxCrossEntropyLoss(
                    axis=-1,
                    sparse_label=True,
                    from_log_softmax=False,
                    batch_axis=None,
                    reduction="sum",
                    exclude=False)
            if boxHardNegativeMining:
                localization_loss = HuberLoss(rho=1,
                                              batch_axis=None,
                                              reduction="sum",
                                              exclude=False)

            conf_loss_sum = 0
            loc_loss_sum = 0
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    cls_all = gluon.utils.split_and_load(cls_all, [ctx],
                                                         even_split=False)
                    box_all = gluon.utils.split_and_load(box_all, [ctx],
                                                         even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    cls_all = gluon.utils.split_and_load(cls_all, [ctx],
                                                         even_split=False)
                    box_all = gluon.utils.split_and_load(box_all, [ctx],
                                                         even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        positive_samples.expand_dims(axis=-1))
                    if positive_numbers:
                        conf_loss = mx.nd.divide(conf_loss, positive_numbers)
                    else:
                        conf_loss = mx.nd.multiply(conf_loss, 0)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    if positive_numbers:
                        loc_loss = mx.nd.divide(loc_loss, positive_numbers)
                    else:
                        loc_loss = mx.nd.multiply(loc_loss, 0)
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)

            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Ejemplo n.º 4
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())
    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    model = str(input_size[0]) + "_" + str(
        input_size[1]) + "_" + optimizer + "_EFF_" + str(base)

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path,
                                  f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground만
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'
    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "beta1": 0.9,
                                    "beta2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "gamma1": 0.9,
                                    "gamma2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "momentum": 0.9,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # optimizer weight 불러오기
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")
    '''
    localization loss -> Smooth L1 loss 
    confidence loss -> Focal 
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split,
                                                             ctx_list,
                                                             even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split,
                                                           ctx_list,
                                                           even_split=False)
                    box_split = gluon.utils.split_and_load(box_split,
                                                           ctx_list,
                                                           even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # optimizer weight 저장하기
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)
            '''
                mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. 
                -yolo v3, gaussian yolo v3 에서는 문제가 발생한다.
                mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다.  

                block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님)
                export_block_for_cplusplus 에서 prediction 이 hybridize 되면서 
                미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다. 
                이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # loss 구하기
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Ejemplo n.º 5
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        epoch=100,
        input_size=[512, 512],
        input_frame_number=2,
        batch_size=16,
        batch_log=100,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        topk=100,
        iou_thresh=0.5,
        nms=False,
        except_class_thresh=0.01,
        nms_thresh=0.5,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        device = torch.device("cpu")
    elif GPU_COUNT == 1:
        device = torch.device("cuda")
    else:
        device = [torch.device(f"cuda:{i}") for i in range(0, GPU_COUNT)]

    if isinstance(device, (list, tuple)):
        context = device[0]
    else:
        context = device

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    # free memory는 정확하지 않은 것 같고, torch.cuda.max_memory_allocated() 가 정확히 어떻게 동작하는지?
    if isinstance(device, (list, tuple)):
        for i, d in enumerate(device):
            total_memory = torch.cuda.get_device_properties(d).total_memory
            free_memory = total_memory - torch.cuda.max_memory_allocated(d)
            free_memory = round(free_memory / (1024**3), 2)
            total_memory = round(total_memory / (1024**3), 2)
            logging.info(f'{torch.cuda.get_device_name(d)}')
            logging.info(
                f'Running on {d} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            total_memory = torch.cuda.get_device_properties(
                device).total_memory
            free_memory = total_memory - torch.cuda.max_memory_allocated(
                device)
            free_memory = round(free_memory / (1024**3), 2)
            total_memory = round(total_memory / (1024**3), 2)
            logging.info(f'{torch.cuda.get_device_name(device)}')
            logging.info(
                f'Running on {device} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {device}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3 * input_frame_number) + tuple(input_size)

    scale_factor = 4  # 고정
    logging.info(f"scale factor {scale_factor}")

    train_dataloader, train_dataset = traindataloader(
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        input_frame_number=input_frame_number,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        scale_factor=scale_factor,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            input_frame_number=input_frame_number,
            batch_size=valid_size,
            num_workers=num_workers,
            pin_memory=True,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    # https://discuss.pytorch.org/t/how-to-save-the-optimizer-setting-in-a-log-in-pytorch/17187
    weight_path = os.path.join("weights", f"{model}")
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.pt')

    start_epoch = 0
    net = CenterNet(base=base,
                    input_frame_number=input_frame_number,
                    heads=OrderedDict([('heatmap', {
                        'num_output': num_classes,
                        'bias': -2.19
                    }), ('offset', {
                        'num_output': 2
                    }), ('wh', {
                        'num_output': 2
                    })]),
                    head_conv_channel=64,
                    pretrained=pretrained_base)

    # https://github.com/sksq96/pytorch-summary
    modelsummary(net.to(context), input_shape[1:])

    if tensorboard:
        summary = SummaryWriter(log_dir=os.path.join("torchboard", model),
                                max_queue=10,
                                flush_secs=10)
        summary.add_graph(net.to(context),
                          input_to_model=torch.ones(input_shape,
                                                    device=context),
                          verbose=False)

    if os.path.exists(param_path):
        start_epoch = load_period
        checkpoint = torch.load(param_path)
        if 'model_state_dict' in checkpoint:
            try:
                net.load_state_dict(checkpoint['model_state_dict'])
            except Exception as E:
                logging.info(E)
            else:
                logging.info(f"loading model_state_dict")

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    net.to(context)

    if optimizer.upper() == "ADAM":
        trainer = Adam(net.parameters(),
                       lr=learning_rate,
                       betas=(0.9, 0.999),
                       weight_decay=0.000001)
    elif optimizer.upper() == "RMSPROP":
        trainer = RMSprop(net.parameters(),
                          lr=learning_rate,
                          alpha=0.99,
                          weight_decay=0.000001,
                          momentum=0)
    elif optimizer.upper() == "SGD":
        trainer = SGD(net.parameters(),
                      lr=learning_rate,
                      momentum=0.9,
                      weight_decay=0.000001)
    else:
        logging.error("optimizer not selected")
        exit(0)

    if os.path.exists(param_path):
        # optimizer weight 불러오기
        checkpoint = torch.load(param_path)
        if 'optimizer_state_dict' in checkpoint:
            try:
                trainer.load_state_dict(checkpoint['optimizer_state_dict'])
            except Exception as E:
                logging.info(E)
            else:
                logging.info(f"loading optimizer_state_dict")

    if isinstance(device, (list, tuple)):
        net = DataParallel(net,
                           device_ids=device,
                           output_device=context,
                           dim=0)

    # optimizer
    # https://pytorch.org/docs/master/optim.html?highlight=lr%20sche#torch.optim.lr_scheduler.CosineAnnealingLR
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = lr_scheduler.StepLR(trainer, step, gamma=decay_lr, last_epoch=-1)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            unique_ids=name_classes,
                            topk=topk,
                            scale=scale_factor,
                            nms=nms,
                            except_class_thresh=except_class_thresh,
                            nms_thresh=nms_thresh)
    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    # torch split이 numpy, mxnet split과 달라서 아래와 같은 작업을 하는 것
    if batch_size % subdivision == 0:
        chunk = int(batch_size) // int(subdivision)
    else:
        logging.info(f"batch_size / subdivision 이 나누어 떨어지지 않습니다.")
        logging.info(f"subdivision 을 다시 설정하고 학습 진행하세요.")
        exit(0)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        # multiscale을 하게되면 여기서 train_dataloader을 다시 만드는 것이 좋겠군..
        for batch_count, (image, _, heatmap_target, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):

            trainer.zero_grad()

            image = image.to(context)
            '''
            이렇게 하는 이유?
            209 line에서 net = net.to(context)로 함
            gpu>=1 인 경우 net = DataParallel(net, device_ids=device, output_device=context, dim=0) 에서 
            output_device - gradient가 계산되는 곳을 context로 했기 때문에 아래의 target들도 context로 지정해줘야 함
            '''
            heatmap_target = heatmap_target.to(context)
            offset_target = offset_target.to(context)
            wh_target = wh_target.to(context)
            mask_target = mask_target.to(context)

            image_split = torch.split(image, chunk, dim=0)
            heatmap_target_split = torch.split(heatmap_target, chunk, dim=0)
            offset_target_split = torch.split(offset_target, chunk, dim=0)
            wh_target_split = torch.split(wh_target, chunk, dim=0)
            mask_target_split = torch.split(mask_target, chunk, dim=0)

            heatmap_losses = []
            offset_losses = []
            wh_losses = []
            total_loss = []

            for image_part, heatmap_target_part, offset_target_part, wh_target_part, mask_target_part in zip(
                    image_split, heatmap_target_split, offset_target_split,
                    wh_target_split, mask_target_split):
                heatmap_pred, offset_pred, wh_pred = net(image_part)
                '''
                pytorch는 trainer.step()에서 batch_size 인자가 없다.
                Loss 구현시 고려해야 한다.(mean 모드) 
                '''
                heatmap_loss = torch.div(
                    heatmapfocalloss(heatmap_pred, heatmap_target_part),
                    subdivision)
                offset_loss = torch.div(
                    normedl1loss(offset_pred, offset_target_part,
                                 mask_target_part) * lambda_off, subdivision)
                wh_loss = torch.div(
                    normedl1loss(wh_pred, wh_target_part, mask_target_part) *
                    lambda_size, subdivision)

                heatmap_losses.append(heatmap_loss.item())
                offset_losses.append(offset_loss.item())
                wh_losses.append(wh_loss.item())

                total_loss.append(heatmap_loss + offset_loss + wh_loss)

            # batch size만큼 나눠줘야 하지 않나?
            autograd.backward(total_loss)

            trainer.step()
            lr_sch.step()

            heatmap_loss_sum += sum(heatmap_losses)
            offset_loss_sum += sum(offset_losses)
            wh_loss_sum += sum(wh_losses)

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {image.shape[0] / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {lr_sch.get_last_lr()}]'
                    f'[heatmap loss = {sum(heatmap_losses):.3f}]'
                    f'[offset loss = {sum(offset_losses):.3f}]'
                    f'[wh loss = {sum(wh_losses):.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)

            module = net.module if isinstance(device, (list, tuple)) else net
            auxnet = Prediction(unique_ids=name_classes,
                                topk=topk,
                                scale=scale_factor,
                                nms=nms,
                                except_class_thresh=except_class_thresh,
                                nms_thresh=nms_thresh)
            prepostnet = PrePostNet(
                net=module,
                auxnet=auxnet,
                input_frame_number=input_frame_number)  # 새로운 객체가 생성

            try:
                torch.save(
                    {
                        'model_state_dict': net.state_dict(),
                        'optimizer_state_dict': trainer.state_dict()
                    }, os.path.join(weight_path, f'{model}-{i:04d}.pt'))

                # torch.jit.trace() 보다는 control-flow 연산 적용이 가능한 torch.jit.script() 을 사용하자
                # torch.jit.script
                script = torch.jit.script(module)
                script.save(os.path.join(weight_path, f'{model}-{i:04d}.jit'))

                script = torch.jit.script(prepostnet)
                script.save(
                    os.path.join(weight_path, f'{model}-prepost-{i:04d}.jit'))

                # # torch.jit.trace - 안 써짐
                # 오류 : Expected object of device type cuda but got device type cpu for argument #2 'other' in call to _th_fmod
                # trace = torch.jit.trace(prepostnet, torch.rand(input_shape[0], input_shape[1], input_shape[2], input_shape[3], device=context))
                # trace.save(os.path.join(weight_path, f'{model}-{i:04d}.jit'))

            except Exception as E:
                logging.error(f"pt, jit export 예외 발생 : {E}")
            else:
                logging.info("pt, jit export 성공")

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # loss 구하기
            for image, label, heatmap_target, offset_target, wh_target, mask_target, _ in valid_dataloader:
                image = image.to(context)
                label = label.to(context)
                gt_box = label[:, :, :4]
                gt_id = label[:, :, 4:5]

                heatmap_target = heatmap_target.to(context)
                offset_target = offset_target.to(context)
                wh_target = wh_target.to(context)
                mask_target = mask_target.to(context)

                heatmap_pred, offset_pred, wh_pred = net(image)
                id, score, bbox = prediction(heatmap_pred, offset_pred,
                                             wh_pred)
                precision_recall.update(pred_bboxes=bbox,
                                        pred_labels=id,
                                        pred_scores=score,
                                        gt_boxes=gt_box * scale_factor,
                                        gt_labels=gt_id)

                heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                offset_loss = normedl1loss(offset_pred, offset_target,
                                           mask_target) * lambda_off
                wh_loss = normedl1loss(wh_pred, wh_target,
                                       mask_target) * lambda_size

                heatmap_loss_sum += heatmap_loss.item()
                offset_loss_sum += offset_loss.item()
                wh_loss_sum += wh_loss.item()

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:

                batch_image = []
                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)  # RGB

                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                image = image.to(context)
                label = label.to(context)
                gt_boxes = label[:, :, :4]
                gt_ids = label[:, :, 4:5]

                heatmap_pred, offset_pred, wh_pred = net(image)
                ids, scores, bboxes = prediction(heatmap_pred, offset_pred,
                                                 wh_pred)

                for img, gt_id, gt_box, heatmap, id, score, bbox in zip(
                        image, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                        bboxes):

                    split_img = torch.split(img, 3,
                                            dim=0)  # numpy split과 다르네...
                    hconcat_image_list = []

                    for j, ig in enumerate(split_img):

                        ig = ig.permute((1, 2, 0)) * torch.tensor(
                            std, device=ig.device) + torch.tensor(
                                mean, device=ig.device)
                        ig = (ig * 255).clamp(0, 255)
                        ig = ig.to(torch.uint8)
                        ig = ig.detach().cpu().numpy().copy()

                        if j == len(split_img) - 1:  # 마지막 이미지
                            # heatmap 그리기
                            heatmap = heatmap.detach().cpu().numpy().copy()
                            heatmap = np.multiply(heatmap,
                                                  255.0)  # 0 ~ 255 범위로 바꾸기
                            heatmap = np.amax(
                                heatmap, axis=0,
                                keepdims=True)  # channel 축으로 가장 큰것 뽑기
                            heatmap = np.transpose(
                                heatmap,
                                axes=(1, 2, 0))  # (height, width, channel=1)
                            heatmap = np.repeat(heatmap, 3, axis=-1)
                            heatmap = heatmap.astype(
                                "uint8")  # float32 -> uint8
                            heatmap = cv2.resize(
                                heatmap,
                                dsize=(input_size[1], input_size[0]))  # 사이즈 원복
                            heatmap = cv2.applyColorMap(
                                heatmap, cv2.COLORMAP_JET)
                            heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

                            # ground truth box 그리기
                            ground_truth = plot_bbox(
                                ig,
                                gt_box * scale_factor,
                                scores=None,
                                labels=gt_id,
                                thresh=None,
                                reverse_rgb=False,
                                class_names=valid_dataset.classes,
                                absolute_coordinates=True,
                                colors=ground_truth_colors)
                            # # prediction box 그리기
                            prediction_box = plot_bbox(
                                ground_truth,
                                bbox,
                                scores=score,
                                labels=id,
                                thresh=plot_class_thresh,
                                reverse_rgb=False,
                                class_names=valid_dataset.classes,
                                absolute_coordinates=True,
                                heatmap=heatmap)
                            hconcat_image_list.append(prediction_box)
                        else:
                            hconcat_image_list.append(ig)

                    hconcat_images = np.concatenate(hconcat_image_list, axis=1)

                    # Tensorboard에 그리기 위해 (height, width, channel) -> (channel, height, width) 를한다.
                    hconcat_images = np.transpose(hconcat_images,
                                                  axes=(2, 0, 1))
                    batch_image.append(
                        hconcat_images)  # (batch, channel, height, width)

                img_grid = torchvision.utils.make_grid(
                    torch.as_tensor(batch_image), nrow=1)
                summary.add_image(tag="valid_result",
                                  img_tensor=img_grid,
                                  global_step=i)

                summary.add_scalar(tag="heatmap_loss/train_heatmap_loss_mean",
                                   scalar_value=train_heatmap_loss_mean,
                                   global_step=i)
                summary.add_scalar(tag="heatmap_loss/valid_heatmap_loss_mean",
                                   scalar_value=valid_heatmap_loss_mean,
                                   global_step=i)

                summary.add_scalar(tag="offset_loss/train_offset_loss_mean",
                                   scalar_value=train_offset_loss_mean,
                                   global_step=i)
                summary.add_scalar(tag="offset_loss/valid_offset_loss_mean",
                                   scalar_value=valid_offset_loss_mean,
                                   global_step=i)

                summary.add_scalar(tag="wh_loss/train_wh_loss_mean",
                                   scalar_value=train_wh_loss_mean,
                                   global_step=i)
                summary.add_scalar(tag="wh_loss/valid_wh_loss_mean",
                                   scalar_value=valid_wh_loss_mean,
                                   global_step=i)

                summary.add_scalar(tag="total_loss/train_total_loss",
                                   scalar_value=train_total_loss_mean,
                                   global_step=i)
                summary.add_scalar(tag="total_loss/valid_total_loss",
                                   scalar_value=valid_total_loss_mean,
                                   global_step=i)

                for name, param in net.named_parameters():
                    summary.add_histogram(tag=name,
                                          values=param,
                                          global_step=i)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))