Exemple #1
0
    def __init__(self, config, logger=None, reporter=None):
        super(SSDEstimator, self).__init__(config, logger, reporter, name=self.__class__.__name__)

        if self._cfg.ssd.amp:
            amp.init()
        if self._cfg.horovod:
            hvd.init()
Exemple #2
0
 def __init__(self, config, logger=None, reporter=None):
     super(YOLOv3Estimator, self).__init__(config, logger, reporter)
     self.last_train = None
     if self._cfg.yolo3.amp:
         amp.init()
     if self._cfg.horovod:
         if hvd is None:
             raise SystemExit("Horovod not found, please check if you installed it correctly.")
         hvd.init()
Exemple #3
0
def train_net(net, config, check_flag, logger, sig_state, sig_pgbar, sig_table):
    print(config)
    # config = Configs()
    # matplotlib.use('Agg')
    # import matplotlib.pyplot as plt
    sig_pgbar.emit(-1)
    mx.random.seed(1)
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    classes = 10
    num_epochs = config.train_cfg.epoch
    batch_size = config.train_cfg.batchsize
    optimizer = config.lr_cfg.optimizer
    lr = config.lr_cfg.lr
    num_gpus = config.train_cfg.gpu
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = config.data_cfg.worker

    warmup = config.lr_cfg.warmup
    if config.lr_cfg.decay == 'cosine':
        lr_sch = lr_scheduler.CosineScheduler((50000//batch_size)*num_epochs,
                                              base_lr=lr,
                                              warmup_steps=warmup *
                                              (50000//batch_size),
                                              final_lr=1e-5)
    else:
        lr_sch = lr_scheduler.FactorScheduler((50000//batch_size)*config.lr_cfg.factor_epoch,
                                              factor=config.lr_cfg.factor,
                                              base_lr=lr,
                                              warmup_steps=warmup*(50000//batch_size))

    model_name = config.net_cfg.name

    if config.data_cfg.mixup:
        model_name += '_mixup'
    if config.train_cfg.amp:
        model_name += '_amp'

    base_dir = './'+model_name
    if os.path.exists(base_dir):
        base_dir = base_dir + '-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())
    makedirs(base_dir)

    if config.save_cfg.tensorboard:
        logdir = base_dir+'/tb/'+model_name
        if os.path.exists(logdir):
            logdir = logdir + '-' + \
                time.strftime("%m-%d-%H.%M.%S", time.localtime())
        sw = SummaryWriter(logdir=logdir, flush_secs=5, verbose=False)
        cmd_file = open(base_dir+'/tb.bat', mode='w')
        cmd_file.write('tensorboard --logdir=./')
        cmd_file.close()

    save_period = 10
    save_dir = base_dir+'/'+'params'
    makedirs(save_dir)

    plot_name = base_dir+'/'+'plot'
    makedirs(plot_name)

    stat_name = base_dir+'/'+'stat.txt'

    csv_name = base_dir+'/'+'data.csv'
    if os.path.exists(csv_name):
        csv_name = base_dir+'/'+'data-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())+'.csv'
    csv_file = open(csv_name, mode='w', newline='')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Epoch', 'train_loss', 'train_acc',
                         'valid_loss', 'valid_acc', 'lr', 'time'])

    logging_handlers = [logging.StreamHandler(), logger]
    logging_handlers.append(logging.FileHandler(
        '%s/train_cifar10_%s.log' % (model_name, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(config)

    if config.train_cfg.amp:
        amp.init()

    if config.save_cfg.profiler:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename=base_dir+'/%s_profile.json' % model_name)
        is_profiler_run = False

    trans_list = []
    imgsize = config.data_cfg.size
    if config.data_cfg.crop:
        trans_list.append(gcv_transforms.RandomCrop(
            32, pad=config.data_cfg.crop_pad))
    if config.data_cfg.cutout:
        trans_list.append(CutOut(config.data_cfg.cutout_size))
    if config.data_cfg.flip:
        trans_list.append(transforms.RandomFlipLeftRight())
    if config.data_cfg.erase:
        trans_list.append(gcv_transforms.block.RandomErasing(s_max=0.25))
    trans_list.append(transforms.Resize(imgsize))
    trans_list.append(transforms.ToTensor())
    trans_list.append(transforms.Normalize([0.4914, 0.4822, 0.4465],
                                           [0.2023, 0.1994, 0.2010]))

    transform_train = transforms.Compose(trans_list)

    transform_test = transforms.Compose([
        transforms.Resize(imgsize),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for i, batch in enumerate(val_data):
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                if check_flag()[0]:
                    sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))

    train(num_epochs, context)
    if config.save_cfg.tensorboard:
        sw.close()

    for ctx in context:
        ctx.empty_cache()

    csv_file.close()
    logging.shutdown()
    reload(logging)
    sig_state.emit(0)
Exemple #4
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001, decay_lr=0.999, decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 일그러지는 것을 막기 위함
    if input_size[0] % 32 != 0 and input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)

    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        '''
        mxnet c++에서 arbitrary input image 를 받기 위한 전략
        alloc_size : tuple of int, default is (128, 128)
        For advanced users. Define `alloc_size` to generate large enough offset
        maps, which will later saved in parameters. During inference, we support arbitrary
        input image by cropping corresponding area of the anchor map. This allow us
        to export to symbol so we can run it in c++, Scalar, etc.
        '''
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground만
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False})

        else:
            logging.error("optimizer not selected")
            exit(0)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0)
            wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0)
            objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0)
            class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0)
            weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                xcyc_all = [xcyc_all]
                wh_all = [wh_all]
                objectness_all = [objectness_all]
                class_all = [class_all]
                weights_all = [weights_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image,
                                                                                                           xcyc_all,
                                                                                                           wh_all,
                                                                                                           objectness_all,
                                                                                                           class_all,
                                                                                                           weights_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False)

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split,
                                                                                              wh_split,
                                                                                              objectness_split,
                                                                                              class_split,
                                                                                              weights_split):
                        output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                            img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                           wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)
                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean
        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # loss 구하기
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size, _, height, width = image.shape

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False)

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []
                total_loss = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all,
                                                                                              wh_all, objectness_all,
                                                                                              class_all, weights_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2,
                                                 offset3, stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                       wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())
                    total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses)

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1,
                                                     offset2, offset3, stride1, stride2, stride3)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes, absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes, absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)

                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean}, global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean}, global_step=i)

                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def main():
    opt = parse_args(parser)

    assert not (os.path.isdir(opt.save_dir)), "already done this experiment..."
    Path(opt.save_dir).mkdir(parents=True)

    filehandler = logging.FileHandler(
        os.path.join(opt.save_dir, opt.logging_file))
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)
    logger.info(opt)

    sw = SummaryWriter(logdir=opt.save_dir, flush_secs=5, verbose=False)

    if opt.use_amp:
        amp.init()

    batch_size = opt.batch_size
    classes = opt.num_classes

    # num_gpus = opt.num_gpus
    # batch_size *= max(1, num_gpus)
    # logger.info('Total batch size is set to %d on %d GPUs' % (batch_size, num_gpus))
    # context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    # num_workers = opt.num_workers

    num_gpus = 1
    context = [mx.gpu(i) for i in range(num_gpus)]
    per_device_batch_size = 5
    num_workers = 12
    batch_size = per_device_batch_size * num_gpus

    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]

    if opt.slowfast:
        optimizer = 'nag'
    else:
        optimizer = 'sgd'

    if opt.clip_grad > 0:
        optimizer_params = {
            'learning_rate': opt.lr,
            'wd': opt.wd,
            'momentum': opt.momentum,
            'clip_gradient': opt.clip_grad
        }
    else:
        # optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum}
        optimizer_params = {'wd': opt.wd, 'momentum': opt.momentum}

    if opt.dtype != 'float32':
        optimizer_params['multi_precision'] = True

    model_name = opt.model
    if opt.use_pretrained and len(opt.hashtag) > 0:
        opt.use_pretrained = opt.hashtag
    net = get_model(name=model_name,
                    nclass=classes,
                    pretrained=opt.use_pretrained,
                    use_tsn=opt.use_tsn,
                    num_segments=opt.num_segments,
                    partial_bn=opt.partial_bn,
                    bn_frozen=opt.freeze_bn)
    # net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    logger.info(net)

    resume_params = find_model_params(opt)
    if resume_params is not '':
        net.load_parameters(resume_params, ctx=context)
        print('Continue training from model %s.' % (resume_params))

    train_data, val_data, batch_fn = get_data_loader(opt, batch_size,
                                                     num_workers, logger)

    iterations_per_epoch = len(train_data) // opt.accumulate
    lr_scheduler = CyclicalSchedule(CosineAnnealingSchedule,
                                    min_lr=0,
                                    max_lr=opt.lr,
                                    cycle_length=opt.T_0 *
                                    iterations_per_epoch,
                                    cycle_length_decay=opt.T_mult,
                                    cycle_magnitude_decay=1)
    optimizer_params['lr_scheduler'] = lr_scheduler

    optimizer = mx.optimizer.SGD(**optimizer_params)
    train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    def test(ctx, val_data, kvstore="None"):
        acc_top1.reset()
        acc_top5.reset()
        #get weights
        weights = get_weights(opt).reshape(1, opt.num_classes)
        weights = mx.nd.array(weights, ctx=mx.gpu(0))

        L = gluon.loss.SoftmaxCrossEntropyLoss()

        num_test_iter = len(val_data)
        val_loss_epoch = 0
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = []
            for _, X in enumerate(data):
                X = X.reshape((-1, ) + X.shape[2:])
                pred = net(X.astype(opt.dtype, copy=False))
                outputs.append(pred)

            if (opt.balanced):
                loss = [
                    L(yhat, y.astype(opt.dtype, copy=False), weights)
                    for yhat, y in zip(outputs, label)
                ]
            else:
                loss = [
                    L(yhat, y.astype(opt.dtype, copy=False))
                    for yhat, y in zip(outputs, label)
                ]

            # loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]

            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)

            val_loss_epoch += sum([l.mean().asscalar()
                                   for l in loss]) / len(loss)

            if opt.log_interval and not (i + 1) % opt.log_interval:
                _, top1 = acc_top1.get()
                _, top5 = acc_top5.get()
                logger.info('Batch [%04d]/[%04d]: acc-top1=%f acc-top5=%f' %
                            (i, num_test_iter, top1 * 100, top5 * 100))

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        val_loss = val_loss_epoch / num_test_iter

        return (top1, top5, val_loss)

    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if opt.partial_bn:
            train_patterns = "None"
            if 'inceptionv3' in opt.model:
                train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
            elif 'inceptionv1' in opt.model:
                train_patterns = '.*weight|.*bias|googlenet0_batchnorm0_gamma|googlenet0_batchnorm0_beta|googlenet0_batchnorm0_running_mean|googlenet0_batchnorm0_running_var'
            else:
                logger.info(
                    'Current model does not support partial batch normalization.'
                )

            # trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, update_on_kvstore=False)
            trainer = gluon.Trainer(net.collect_params(train_patterns),
                                    optimizer,
                                    update_on_kvstore=False)

        elif opt.freeze_bn:
            train_patterns = '.*weight|.*bias'
            # trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, update_on_kvstore=False)
            trainer = gluon.Trainer(net.collect_params(train_patterns),
                                    optimizer,
                                    update_on_kvstore=False)

        else:
            # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, update_on_kvstore=False)
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    update_on_kvstore=False)

        if opt.accumulate > 1:
            params = [
                p for p in net.collect_params().values()
                if p.grad_req != 'null'
            ]
            for p in params:
                p.grad_req = 'add'

        if opt.resume_states is not '':
            trainer.load_states(opt.resume_states)

        if opt.use_amp:
            amp.init_trainer(trainer)

        L = gluon.loss.SoftmaxCrossEntropyLoss()

        best_val_score = 0
        lr_decay_count = 0
        #compute weights
        weights = get_weights(opt).reshape(1, opt.num_classes)
        weights = mx.nd.array(weights, ctx=mx.gpu(0))

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            btic = time.time()
            num_train_iter = len(train_data)
            train_loss_epoch = 0
            train_loss_iter = 0

            for i, batch in tqdm(enumerate(train_data)):
                data, label = batch_fn(batch, ctx)

                with ag.record():
                    outputs = []
                    for _, X in enumerate(data):
                        X = X.reshape((-1, ) + X.shape[2:])
                        # pred = net(X.astype(opt.dtype, copy=False))
                        pred = net(X)
                        outputs.append(pred)
                    if (opt.balanced):
                        loss = [
                            L(yhat, y.astype(opt.dtype, copy=False), weights)
                            for yhat, y in zip(outputs, label)
                        ]

                    else:
                        loss = [
                            L(yhat, y.astype(opt.dtype, copy=False))
                            for yhat, y in zip(outputs, label)
                        ]

                    if opt.use_amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)

                if opt.accumulate > 1:
                    if (i + 1) % opt.accumulate == 0:
                        trainer.step(batch_size * opt.accumulate)
                        net.collect_params().zero_grad()
                else:
                    trainer.step(batch_size)

                train_metric.update(label, outputs)
                train_loss_iter = sum([l.mean().asscalar()
                                       for l in loss]) / len(loss)
                train_loss_epoch += train_loss_iter

                train_metric_name, train_metric_score = train_metric.get()
                sw.add_scalar(tag='train_acc_top1_iter',
                              value=train_metric_score * 100,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='train_loss_iter',
                              value=train_loss_iter,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='learning_rate_iter',
                              value=trainer.learning_rate,
                              global_step=epoch * num_train_iter + i)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    logger.info(
                        'Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f'
                        % (epoch, i, num_train_iter,
                           batch_size * opt.log_interval /
                           (time.time() - btic), train_metric_name,
                           train_metric_score * 100, train_loss_epoch /
                           (i + 1), trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))
            mx.ndarray.waitall()

            logger.info('[Epoch %03d] training: %s=%f\t loss=%f' %
                        (epoch, train_metric_name, train_metric_score * 100,
                         train_loss_epoch / num_train_iter))
            logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            sw.add_scalar(tag='train_loss_epoch',
                          value=train_loss_epoch / num_train_iter,
                          global_step=epoch)

            if not opt.train_only:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

                logger.info(
                    '[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f'
                    %
                    (epoch, acc_top1_val * 100, acc_top5_val * 100, loss_val))
                sw.add_scalar(tag='val_loss_epoch',
                              value=loss_val,
                              global_step=epoch)
                sw.add_scalar(tag='val_acc_top1_epoch',
                              value=acc_top1_val * 100,
                              global_step=epoch)

                if acc_top1_val > best_val_score:
                    best_val_score = acc_top1_val
                    net.save_parameters('%s/%.4f-%s-%s-%03d-best.params' %
                                        (opt.save_dir, best_val_score,
                                         opt.dataset, model_name, epoch))
                    trainer.save_states('%s/%.4f-%s-%s-%03d-best.states' %
                                        (opt.save_dir, best_val_score,
                                         opt.dataset, model_name, epoch))
                # else:
                #     if opt.save_frequency and opt.save_dir and (epoch + 1) % opt.save_frequency == 0:
                #         net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, epoch))
                #         trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, epoch))

        # # save the last model
        # net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
        # trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
        def return_float(el):
            return float(el)

        try:
            #remove "trash" files
            performances = [
                get_file_stem(file).split("-")[0]
                for file in os.listdir(opt.save_dir) if "params" in file
            ]

            best_performance = sorted(performances,
                                      key=return_float,
                                      reverse=True)[0]

            params_trash = [
                os.path.join(opt.save_dir, file)
                for file in os.listdir(opt.save_dir)
                if (("params" in file) and not (best_performance in file))
            ]
            states_trash = [
                os.path.join(opt.save_dir, file)
                for file in os.listdir(opt.save_dir)
                if (("states" in file) and not (best_performance in file))
            ]
            trash_files = params_trash + states_trash

            for file in trash_files:
                os.remove(file)
        except:
            print("Sth went wrong...")

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    train(context)
    sw.close()
Exemple #6
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
        box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +
        [[1, 2, 0.5]] * 2,
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[400, 600],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=10,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base="VGG16_512",
        pretrained_base=True,
        pretrained_path="modelparam",
        classHardNegativeMining=True,
        boxHardNegativeMining=True,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        decode_number=-1,
        multiperclass=True,
        nms_thresh=0.45,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.01,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training SSD Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        if base.upper() == "VGG16_300":  # 입력 사이즈 300 x 300 추천
            net = SSD_VGG16(version=300,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            alloc_size=anchor_alloc_size,
                            ctx=mx.cpu())
        elif base.upper() == "VGG16_512":  # 입력 사이즈 512 x 512 추천
            net = SSD_VGG16(version=512,
                            input_size=input_size,
                            box_sizes=box_sizes,
                            box_ratios=box_ratios,
                            anchor_box_clip=anchor_box_clip,
                            ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            make_target=True)
    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    # 이름 다시 붙이기
    optimizer = optimizer.upper()
    base = base.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + base
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_" + base

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        if base.upper() == "VGG16_300":  # 입력 사이즈 300 x 300 추천
            net = SSD_VGG16(
                version=300,
                input_size=input_size,
                # box_sizes=[21, 45, 101.25, 157.5, 213.75, 270, 326.25],
                # box_ratios=[[1, 2, 0.5]] +  # conv4_3
                #            [[1, 2, 0.5, 3, 1.0 / 3]] * 3 +  # conv7, conv8_2, conv9_2, conv10_2
                #            [[1, 2, 0.5]] * 2,  # conv11_2, conv12_2
                box_sizes=box_sizes,
                box_ratios=box_ratios,
                num_classes=num_classes,
                pretrained=pretrained_base,
                pretrained_path=pretrained_path,
                anchor_box_clip=anchor_box_clip,
                alloc_size=anchor_alloc_size,
                ctx=ctx)

        elif base.upper() == "VGG16_512":  # 입력 사이즈 512 x 512 추천
            net = SSD_VGG16(
                version=512,
                input_size=input_size,
                # box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72],
                # box_ratios=[[1, 2, 0.5]] +  # conv4_3
                #            [[1, 2, 0.5, 3, 1.0 / 3]] * 4 +  # conv7, conv8_2, conv9_2, conv10_2
                #            [[1, 2, 0.5]] * 2,  # conv11_2, conv12_2
                box_sizes=box_sizes,
                box_ratios=box_ratios,
                num_classes=num_classes,
                pretrained=pretrained_base,
                pretrained_path=pretrained_path,
                anchor_box_clip=anchor_box_clip,
                ctx=ctx)
        else:
            logging.warning("backbone 없음")
            exit(0)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0005,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0005,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)
    '''
    localization loss -> Smooth L1 loss
    confidence loss -> Softmax
    '''
    if not classHardNegativeMining:
        confidence_loss = SoftmaxCrossEntropyLoss(axis=-1,
                                                  sparse_label=True,
                                                  from_log_softmax=False,
                                                  batch_axis=None,
                                                  reduction="sum",
                                                  exclude=False)
    if not boxHardNegativeMining:
        localization_loss = HuberLoss(rho=1,
                                      batch_axis=None,
                                      reduction="sum",
                                      exclude=False)

    prediction = Prediction(from_softmax=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    start_time = time.time()

    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_t_all = [cls_t_all]
                box_t_all = [box_t_all]

            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(
                            image_split, [ctx], even_split=False)
                        cls_split = gluon.utils.split_and_load(
                            cls_split, [ctx], even_split=False)
                        box_split = gluon.utils.split_and_load(
                            box_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(
                            image_split, ctx, even_split=False)
                        cls_split = gluon.utils.split_and_load(
                            cls_split, ctx, even_split=False)
                        box_split = gluon.utils.split_and_load(
                            box_split, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        # 1. SSD network Inference
                        cls_pred, box_pred, anchor = net(img)
                        '''
                            4. Hard negative mining (class에만 loss 계산)
                            Hard negative mining After the matching step, most of the default boxes are negatives,
                            especially when the number of possible default boxes is large. This introduces a
                            significant imbalance between the positive and negative training examples. Instead of
                            using all the negative examples, we sort them using the highest confidence loss for each
                            default box and pick the top ones so that the ratio between the negatives and positives is
                            at most 3:1. We found that this leads to faster optimization and a more stable training
                        '''
                        weight_term_alpha = 1
                        negative_mining_ratio = 3
                        positive_samples = cls_target > 0  # True or False
                        positive_numbers = positive_samples.sum()
                        if classHardNegativeMining:
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss = -mx.nd.pick(
                                pred, cls_target,
                                axis=-1)  # (batch, all feature number)
                            '''
                            we sort them using the highest confidence loss for each
                            default box and pick the top ones so that the ratio between the negatives and positives is
                            at most 3:1.
                            '''
                            negative_samples_conf_loss = (conf_loss *
                                                          negative_samples)
                            # 아래 3줄의 코드 출처 : from gluoncv.loss import SSDMultiBoxLoss
                            negative_samples_index = mx.nd.argsort(
                                negative_samples_conf_loss,
                                axis=-1,
                                is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index,
                                                      axis=-1,
                                                      is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers,
                                negative_mining_ratio).expand_dims(-1)
                            pos_hardnega = positive_samples + hard_negative_samples
                            conf_loss = mx.nd.where(
                                pos_hardnega > 0, conf_loss,
                                mx.nd.zeros_like(conf_loss))
                            conf_loss = mx.nd.sum(conf_loss)
                            if positive_numbers:
                                conf_loss = mx.nd.divide(
                                    conf_loss, positive_numbers)
                            else:
                                conf_loss = mx.nd.multiply(conf_loss, 0)
                            cls_losses.append(conf_loss.asscalar())
                        else:
                            conf_loss = confidence_loss(
                                cls_pred, cls_target,
                                positive_samples.expand_dims(axis=-1))
                            if positive_numbers:
                                conf_loss = mx.nd.divide(
                                    conf_loss, positive_numbers)
                            else:
                                conf_loss = mx.nd.multiply(conf_loss, 0)
                            cls_losses.append(conf_loss.asscalar())

                        if boxHardNegativeMining:
                            # loc loss에도 hard HardNegativeMining 적용해보자.
                            pred = mx.nd.log_softmax(cls_pred, axis=-1)
                            negative_samples = 1 - positive_samples
                            conf_loss_for_box = -mx.nd.pick(
                                pred, cls_target,
                                axis=-1)  # (batch, all feature number)
                            negative_samples_conf_loss = (conf_loss_for_box *
                                                          negative_samples)
                            negative_samples_index = mx.nd.argsort(
                                negative_samples_conf_loss,
                                axis=-1,
                                is_ascend=False)
                            selection = mx.nd.argsort(negative_samples_index,
                                                      axis=-1,
                                                      is_ascend=True)
                            hard_negative_samples = selection <= mx.nd.multiply(
                                positive_numbers,
                                negative_mining_ratio).expand_dims(-1)
                            pos_hardnega = positive_samples + hard_negative_samples
                            pos_hardnega = mx.nd.repeat(
                                pos_hardnega.reshape(shape=(0, 0, 1)),
                                repeats=4,
                                axis=-1)

                            loc_loss = mx.nd.abs(box_pred - box_target)
                            loc_loss = mx.nd.where(loc_loss > 1,
                                                   loc_loss - 0.5, (0.5 / 1) *
                                                   mx.nd.square(loc_loss))
                            loc_loss = mx.nd.where(pos_hardnega > 0, loc_loss,
                                                   mx.nd.zeros_like(loc_loss))
                            loc_loss = mx.nd.sum(loc_loss)
                            if positive_numbers:
                                loc_loss = mx.nd.divide(
                                    loc_loss, positive_numbers)
                            else:
                                loc_loss = mx.nd.multiply(loc_loss, 0)
                            box_losses.append(loc_loss.asscalar())
                        else:
                            loc_loss = localization_loss(
                                box_pred, box_target,
                                positive_samples.expand_dims(axis=-1))
                            if positive_numbers:
                                loc_loss = mx.nd.divide(
                                    loc_loss, positive_numbers)
                            else:
                                loc_loss = mx.nd.multiply(loc_loss, 0)
                            box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss +
                                          weight_term_alpha * loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            if classHardNegativeMining:
                confidence_loss = SoftmaxCrossEntropyLoss(
                    axis=-1,
                    sparse_label=True,
                    from_log_softmax=False,
                    batch_axis=None,
                    reduction="sum",
                    exclude=False)
            if boxHardNegativeMining:
                localization_loss = HuberLoss(rho=1,
                                              batch_axis=None,
                                              reduction="sum",
                                              exclude=False)

            conf_loss_sum = 0
            loc_loss_sum = 0
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    cls_all = gluon.utils.split_and_load(cls_all, [ctx],
                                                         even_split=False)
                    box_all = gluon.utils.split_and_load(box_all, [ctx],
                                                         even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    cls_all = gluon.utils.split_and_load(cls_all, [ctx],
                                                         even_split=False)
                    box_all = gluon.utils.split_and_load(box_all, [ctx],
                                                         even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        positive_samples.expand_dims(axis=-1))
                    if positive_numbers:
                        conf_loss = mx.nd.divide(conf_loss, positive_numbers)
                    else:
                        conf_loss = mx.nd.multiply(conf_loss, 0)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    if positive_numbers:
                        loc_loss = mx.nd.divide(loc_loss, positive_numbers)
                    else:
                        loc_loss = mx.nd.multiply(loc_loss, 0)
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)

            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def main():
    opt = parse_args()

    makedirs(opt.save_dir)

    filehandler = logging.FileHandler(
        os.path.join(opt.save_dir, opt.logging_file))
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)
    logger.info(opt)

    sw = SummaryWriter(logdir=opt.save_dir, flush_secs=5, verbose=False)

    if opt.kvstore is not None:
        kv = mx.kvstore.create(opt.kvstore)
        logger.info(
            'Distributed training with %d workers and current rank is %d' %
            (kv.num_workers, kv.rank))
    if opt.use_amp:
        amp.init()

    batch_size = opt.batch_size
    classes = opt.num_classes

    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    logger.info('Total batch size is set to %d on %d GPUs' %
                (batch_size, num_gpus))
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]

    optimizer = 'sgd'
    if opt.clip_grad > 0:
        optimizer_params = {
            'learning_rate': opt.lr,
            'wd': opt.wd,
            'momentum': opt.momentum,
            'clip_gradient': opt.clip_grad
        }
    else:
        optimizer_params = {
            'learning_rate': opt.lr,
            'wd': opt.wd,
            'momentum': opt.momentum
        }

    if opt.dtype != 'float32':
        optimizer_params['multi_precision'] = True

    model_name = opt.model
    net = get_model(name=model_name,
                    nclass=classes,
                    pretrained=opt.use_pretrained,
                    use_tsn=opt.use_tsn,
                    num_segments=opt.num_segments,
                    partial_bn=opt.partial_bn)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    logger.info(net)

    if opt.resume_params is not '':
        net.load_parameters(opt.resume_params, ctx=context)

    if opt.kvstore is not None:
        train_data, val_data, batch_fn = get_data_loader(
            opt, batch_size, num_workers, logger, kv)
    else:
        train_data, val_data, batch_fn = get_data_loader(
            opt, batch_size, num_workers, logger)

    num_batches = len(train_data)
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=opt.lr,
                    nepochs=opt.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode,
                    base_lr=opt.lr,
                    target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay,
                    power=2)
    ])
    optimizer_params['lr_scheduler'] = lr_scheduler

    train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    def test(ctx, val_data, kvstore=None):
        acc_top1.reset()
        acc_top5.reset()
        L = gluon.loss.SoftmaxCrossEntropyLoss()
        num_test_iter = len(val_data)
        val_loss_epoch = 0
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = []
            for _, X in enumerate(data):
                X = X.reshape((-1, ) + X.shape[2:])
                pred = net(X.astype(opt.dtype, copy=False))
                outputs.append(pred)

            loss = [
                L(yhat, y.astype(opt.dtype, copy=False))
                for yhat, y in zip(outputs, label)
            ]

            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)

            val_loss_epoch += sum([l.mean().asscalar()
                                   for l in loss]) / len(loss)

            if opt.log_interval and not (i + 1) % opt.log_interval:
                logger.info('Batch [%04d]/[%04d]: evaluated' %
                            (i, num_test_iter))

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        val_loss = val_loss_epoch / num_test_iter

        if kvstore is not None:
            top1_nd = nd.zeros(1)
            top5_nd = nd.zeros(1)
            val_loss_nd = nd.zeros(1)
            kvstore.push(111111, nd.array(np.array([top1])))
            kvstore.pull(111111, out=top1_nd)
            kvstore.push(555555, nd.array(np.array([top5])))
            kvstore.pull(555555, out=top5_nd)
            kvstore.push(999999, nd.array(np.array([val_loss])))
            kvstore.pull(999999, out=val_loss_nd)
            top1 = top1_nd.asnumpy() / kvstore.num_workers
            top5 = top5_nd.asnumpy() / kvstore.num_workers
            val_loss = val_loss_nd.asnumpy() / kvstore.num_workers

        return (top1, top5, val_loss)

    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if opt.partial_bn:
            train_patterns = None
            if 'inceptionv3' in opt.model:
                train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
            else:
                logger.info(
                    'Current model does not support partial batch normalization.'
                )

            if opt.kvstore is not None:
                trainer = gluon.Trainer(net.collect_params(train_patterns),
                                        optimizer,
                                        optimizer_params,
                                        kvstore=kv,
                                        update_on_kvstore=False)
            else:
                trainer = gluon.Trainer(net.collect_params(train_patterns),
                                        optimizer,
                                        optimizer_params,
                                        update_on_kvstore=False)
        else:
            if opt.kvstore is not None:
                trainer = gluon.Trainer(net.collect_params(),
                                        optimizer,
                                        optimizer_params,
                                        kvstore=kv,
                                        update_on_kvstore=False)
            else:
                trainer = gluon.Trainer(net.collect_params(),
                                        optimizer,
                                        optimizer_params,
                                        update_on_kvstore=False)

        if opt.accumulate > 1:
            params = [
                p for p in net.collect_params().values()
                if p.grad_req != 'null'
            ]
            for p in params:
                p.grad_req = 'add'

        if opt.resume_states is not '':
            trainer.load_states(opt.resume_states)

        if opt.use_amp:
            amp.init_trainer(trainer)

        L = gluon.loss.SoftmaxCrossEntropyLoss()

        best_val_score = 0
        lr_decay_count = 0

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            btic = time.time()
            num_train_iter = len(train_data)
            train_loss_epoch = 0
            train_loss_iter = 0

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                with ag.record():
                    outputs = []
                    for _, X in enumerate(data):
                        X = X.reshape((-1, ) + X.shape[2:])
                        pred = net(X.astype(opt.dtype, copy=False))
                        outputs.append(pred)
                    loss = [
                        L(yhat, y.astype(opt.dtype, copy=False))
                        for yhat, y in zip(outputs, label)
                    ]

                    if opt.use_amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)

                if opt.accumulate > 1 and (i + 1) % opt.accumulate == 0:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers *
                                     opt.accumulate)
                    else:
                        trainer.step(batch_size * opt.accumulate)
                        net.collect_params().zero_grad()
                else:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers)
                    else:
                        trainer.step(batch_size)

                train_metric.update(label, outputs)
                train_loss_iter = sum([l.mean().asscalar()
                                       for l in loss]) / len(loss)
                train_loss_epoch += train_loss_iter

                train_metric_name, train_metric_score = train_metric.get()
                sw.add_scalar(tag='train_acc_top1_iter',
                              value=train_metric_score * 100,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='train_loss_iter',
                              value=train_loss_iter,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='learning_rate_iter',
                              value=trainer.learning_rate,
                              global_step=epoch * num_train_iter + i)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    logger.info(
                        'Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f'
                        % (epoch, i, num_train_iter,
                           batch_size * opt.log_interval /
                           (time.time() - btic), train_metric_name,
                           train_metric_score * 100, train_loss_epoch /
                           (i + 1), trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))
            mx.ndarray.waitall()

            if opt.kvstore is not None and epoch == opt.resume_epoch:
                kv.init(111111, nd.zeros(1))
                kv.init(555555, nd.zeros(1))
                kv.init(999999, nd.zeros(1))

            if opt.kvstore is not None:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data, kv)
            else:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

            logger.info('[Epoch %03d] training: %s=%f\t loss=%f' %
                        (epoch, train_metric_name, train_metric_score * 100,
                         train_loss_epoch / num_train_iter))
            logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            logger.info(
                '[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f' %
                (epoch, acc_top1_val * 100, acc_top5_val * 100, loss_val))

            sw.add_scalar(tag='train_loss_epoch',
                          value=train_loss_epoch / num_train_iter,
                          global_step=epoch)
            sw.add_scalar(tag='val_loss_epoch',
                          value=loss_val,
                          global_step=epoch)
            sw.add_scalar(tag='val_acc_top1_epoch',
                          value=acc_top1_val * 100,
                          global_step=epoch)

            if acc_top1_val > best_val_score:
                best_val_score = acc_top1_val
                net.save_parameters('%s/%.4f-%s-%s-%03d-best.params' %
                                    (opt.save_dir, best_val_score, opt.dataset,
                                     model_name, epoch))
                trainer.save_states('%s/%.4f-%s-%s-%03d-best.states' %
                                    (opt.save_dir, best_val_score, opt.dataset,
                                     model_name, epoch))
            else:
                if opt.save_frequency and opt.save_dir and (
                        epoch + 1) % opt.save_frequency == 0:
                    net.save_parameters(
                        '%s/%s-%s-%03d.params' %
                        (opt.save_dir, opt.dataset, model_name, epoch))
                    trainer.save_states(
                        '%s/%s-%s-%03d.states' %
                        (opt.save_dir, opt.dataset, model_name, epoch))

        # save the last model
        net.save_parameters(
            '%s/%s-%s-%03d.params' %
            (opt.save_dir, opt.dataset, model_name, opt.num_epochs - 1))
        trainer.save_states(
            '%s/%s-%s-%03d.states' %
            (opt.save_dir, opt.dataset, model_name, opt.num_epochs - 1))

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    train(context)
    sw.close()
Exemple #8
0
def main(logger, opt):
    """train model"""
    filehandler = logging.FileHandler(
        os.path.join(opt.save_dir, opt.logging_file))
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)
    logger.info(opt)

    if opt.use_amp:
        amp.init()

    num_gpus = opt.ngpus
    batch_size = opt.batch_size * max(1, num_gpus)
    logger.info('Total batch size is set to %d on %d GPUs' %
                (batch_size, num_gpus))
    train_loader = build_data_loader(batch_size)

    # create model
    net = get_model(opt.model_name,
                    bz=opt.batch_size,
                    is_train=opt.is_train,
                    ctx=opt.ctx)
    net.cast(opt.dtype)
    logger.info(net)
    net.collect_params().reset_ctx(opt.ctx)
    if opt.resume_params is not None:
        if os.path.isfile(opt.resume_params):
            net.load_parameters(opt.resume_params, ctx=opt.ctx)
            print('Continue training from model %s.' % (opt.resume_params))
        else:
            raise RuntimeError("=> no checkpoint found at '{}'".format(
                opt.resume_params))

    # create criterion
    criterion = SiamRPNLoss(opt.batch_size)
    # optimizer and lr scheduling
    step_epoch = [10, 20, 30, 40, 50]
    num_batches = len(train_loader)
    lr_scheduler = LRSequential([
        LRScheduler(
            mode='step',
            base_lr=0.005,
            target_lr=0.01,
            nepochs=opt.warmup_epochs,
            iters_per_epoch=num_batches,
            step_epoch=step_epoch,
        ),
        LRScheduler(mode='poly',
                    base_lr=0.01,
                    target_lr=0.005,
                    nepochs=opt.epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=[e - opt.warmup_epochs for e in step_epoch],
                    power=0.02)
    ])

    optimizer_params = {
        'lr_scheduler': lr_scheduler,
        'wd': opt.weight_decay,
        'momentum': opt.momentum,
        'learning_rate': opt.lr
    }

    if opt.dtype == 'float32':
        optimizer_params['multi_precision'] = True

    if opt.use_amp:
        amp.init_trainer(optimizer_params)

    if opt.no_wd:
        for k, v in net.module.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    optimizer = gluon.Trainer(net.collect_params(), 'sgd', optimizer_params)

    if opt.accumulate > 1:
        params = [
            p for p in net.collect_params().values() if p.grad_req != 'null'
        ]
        for p in params:
            p.grad_req = 'add'

    train(opt, net, train_loader, criterion, optimizer, batch_size, logger)
Exemple #9
0
def main():
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    opt = parse_args()
    batch_size = opt.batch_size
    classes = 10

    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    lr_sch = lr_scheduler.CosineScheduler((50000//batch_size)*opt.num_epochs,
                                          base_lr=opt.lr,
                                          warmup_steps=5*(50000//batch_size),
                                          final_lr=1e-5)
    # lr_sch = lr_scheduler.FactorScheduler((50000//batch_size)*20,
    #                                       factor=0.2, base_lr=opt.lr,
    #                                       warmup_steps=5*(50000//batch_size))
    # lr_sch = LRScheduler('cosine',opt.lr, niters=(50000//batch_size)*opt.num_epochs,)

    model_name = opt.model
    net = SKT_Lite()
    # if model_name.startswith('cifar_wideresnet'):
    #     kwargs = {'classes': classes,
    #             'drop_rate': opt.drop_rate}
    # else:
    #     kwargs = {'classes': classes}
    # net = get_model(model_name, **kwargs)
    if opt.mixup:
        model_name += '_mixup'
    if opt.amp:
        model_name += '_amp'

    makedirs('./'+model_name)
    os.chdir('./'+model_name)
    sw = SummaryWriter(
        logdir='.\\tb\\'+model_name, flush_secs=5, verbose=False)
    makedirs(opt.save_plot_dir)

    if opt.resume_from:
        net.load_parameters(opt.resume_from, ctx=context)
    optimizer = 'nag'

    save_period = opt.save_period
    if opt.save_dir and save_period:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_period = 0

    plot_name = opt.save_plot_dir

    logging_handlers = [logging.StreamHandler()]
    if opt.logging_dir:
        logging_dir = opt.logging_dir
        makedirs(logging_dir)
        logging_handlers.append(logging.FileHandler(
            '%s/train_cifar10_%s.log' % (logging_dir, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(opt)

    if opt.amp:
        amp.init()

    if opt.profile_mode:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='%s_profile.json' % model_name)

    transform_train = transforms.Compose([
        gcv_transforms.RandomCrop(32, pad=4),
        CutOut(8),
        # gcv_transforms.block.RandomErasing(s_max=0.25),
        transforms.RandomFlipLeftRight(),
        # transforms.RandomFlipTopBottom(),
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    transform_test = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for i, batch in enumerate(val_data):
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        root = os.path.join('..', 'datasets', 'cifar-10')
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd,
                                 'momentum': opt.momentum, 'lr_scheduler': lr_sch})
        if opt.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if opt.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    profiler.set_state('run')
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not opt.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not opt.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if opt.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    nd.waitall()
                    profiler.set_state('stop')
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            if opt.mixup:
                train_history.update([acc, 1-val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            else:
                train_history.update([1-train_acc, 1-val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            # acc_history.update([train_acc, val_acc])
            # plt.cla()
            # acc_history.plot(save_path='%s/%s_acc.png' %
            #                  (plot_name, model_name), legend_loc='best')

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, time.time()-tic))
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)
            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))

    if opt.mode == 'hybrid':
        net.hybridize()
    train(opt.num_epochs, context)
    if opt.profile_mode:
        profiler.dump(finished=False)
    sw.close()
def main(async_executor=None):
    # Setup MLPerf logger
    mllog.config()
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    # Start MLPerf benchmark
    log_start(key=mlperf_constants.INIT_START, uniq=False)

    # Parse args
    args = parse_args()

    ############################################################################
    # Initialize various libraries (horovod, logger, amp ...)
    ############################################################################
    # Initialize async executor
    if args.async_val:
        assert async_executor is not None, 'Please use ssd_main_async.py to launch with async support'
    else:
        # (Force) disable async validation
        async_executor = None

    # Initialize horovod
    hvd.init()

    # Initialize AMP
    if args.precision == 'amp':
        amp.init(layout_optimization=True)

    # Set MXNET_SAFE_ACCUMULATION=1 if necessary
    if args.precision == 'fp16':
        os.environ["MXNET_SAFE_ACCUMULATION"] = "1"

    # Results folder
    network_name = f'ssd_{args.backbone}_{args.data_layout}_{args.dataset}_{args.data_shape}'
    save_prefix = None
    if args.results:
        save_prefix = os.path.join(args.results, network_name)
    else:
        logging.info(
            "No results folder was provided. The script will not write logs or save weight to disk"
        )

    # Initialize logger
    log_file = None
    if args.results:
        log_file = f'{save_prefix}_{args.mode}_{hvd.rank()}.log'
    setup_logger(level=args.log_level
                 if hvd.local_rank() in args.log_local_ranks else 'CRITICAL',
                 log_file=log_file)

    # Set seed
    args.seed = set_seed_distributed(args.seed)
    ############################################################################

    ############################################################################
    # Validate arguments and print some useful information
    ############################################################################
    logging.info(args)

    assert not (args.resume_from and args.pretrained_backbone), (
        "--resume-from and --pretrained_backbone are "
        "mutually exclusive.")
    assert args.data_shape == 300, "only data_shape=300 is supported at the moment."
    assert args.input_batch_multiplier >= 1, "input_batch_multiplier must be >= 1"
    assert not (hvd.size() == 1 and args.gradient_predivide_factor > 1), (
        "Gradient predivide factor is not supported "
        "with a single GPU")
    if args.data_layout == 'NCHW' or args.precision == 'fp32':
        assert args.bn_group == 1, "Group batch norm doesn't support FP32 data format or NCHW data layout."
        if not args.no_fuse_bn_relu:
            logging.warning((
                "WARNING: fused batch norm relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_relu = True
        if not args.no_fuse_bn_add_relu:
            logging.warning((
                "WARNING: fused batch norm add relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_add_relu = True
    if args.profile_no_horovod and hvd.size() > 1:
        logging.warning(
            "WARNING: hvd.size() > 1, so must IGNORE requested --profile-no-horovod"
        )
        args.profile_no_horovod = False

    logging.info(f'Seed: {args.seed}')
    logging.info(f'precision: {args.precision}')
    if args.precision == 'fp16':
        logging.info(f'loss scaling: {args.fp16_loss_scale}')
    logging.info(f'network name: {network_name}')
    logging.info(f'fuse bn relu: {not args.no_fuse_bn_relu}')
    logging.info(f'fuse bn add relu: {not args.no_fuse_bn_add_relu}')
    logging.info(f'bn group: {args.bn_group}')
    logging.info(f'bn all reduce fp16: {args.bn_fp16}')
    logging.info(f'MPI size: {hvd.size()}')
    logging.info(f'MPI global rank: {hvd.rank()}')
    logging.info(f'MPI local rank: {hvd.local_rank()}')
    logging.info(f'async validation: {args.async_val}')
    ############################################################################

    # TODO(ahmadki): load network and anchors based on args.backbone (JoC)
    # Load network
    net = ssd_300_resnet34_v1_mlperf_coco(
        pretrained_base=False,
        nms_overlap_thresh=args.nms_overlap_thresh,
        nms_topk=args.nms_topk,
        nms_valid_thresh=args.nms_valid_thresh,
        post_nms=args.post_nms,
        layout=args.data_layout,
        fuse_bn_add_relu=not args.no_fuse_bn_add_relu,
        fuse_bn_relu=not args.no_fuse_bn_relu,
        bn_fp16=args.bn_fp16,
        norm_kwargs={'bn_group': args.bn_group})

    # precomputed anchors
    anchors_np = mlperf_xywh_anchors(image_size=args.data_shape,
                                     clip=True,
                                     normalize=True)
    if args.test_anchors and hvd.rank() == 0:
        logging.info(f'Normalized anchors: {anchors_np}')

    # Training mode
    train_net = None
    train_pipeline = None
    trainer_fn = None
    lr_scheduler = None
    if args.mode in ['train', 'train_val']:
        # Training iterator
        num_cropping_iterations = 1
        if args.use_tfrecord:
            tfrecord_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.tfrecord'))
            index_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.idx'))
            tfrecords = [(tfrecod, index)
                         for tfrecod, index in zip(tfrecord_files, index_files)
                         ]
        train_pipeline = get_training_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            anchors=anchors_np,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.batch_size * args.input_batch_multiplier,
            dataset_size=args.dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_cropping_iterations=num_cropping_iterations,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16',
            input_jpg_decode=args.input_jpg_decode,
            hw_decoder_load=args.hw_decoder_load,
            decoder_cache_size=min(
                (100 * 1024 + hvd.size() - 1) // hvd.size(), 12 *
                1024) if args.input_jpg_decode == 'cache' else 0,
            seed=args.seed)
        log_event(key=mlperf_constants.TRAIN_SAMPLES,
                  value=train_pipeline.epoch_size)
        log_event(key=mlperf_constants.MAX_SAMPLES,
                  value=num_cropping_iterations)

        # Training network
        train_net = SSDMultiBoxLoss(net=net,
                                    local_batch_size=args.batch_size,
                                    bulk_last_wgrad=args.bulk_last_wgrad)

        # Trainer function. SSDModel expects a function that takes 1 parameter - HybridBlock
        trainer_fn = functools.partial(
            sgd_trainer,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            precision=args.precision,
            fp16_loss_scale=args.fp16_loss_scale,
            gradient_predivide_factor=args.gradient_predivide_factor,
            num_groups=args.horovod_num_groups,
            profile_no_horovod=args.profile_no_horovod)

        # Learning rate scheduler
        lr_scheduler = MLPerfLearningRateScheduler(
            learning_rate=args.lr,
            decay_factor=args.lr_decay_factor,
            decay_epochs=args.lr_decay_epochs,
            warmup_factor=args.lr_warmup_factor,
            warmup_epochs=args.lr_warmup_epochs,
            epoch_size=train_pipeline.epoch_size,
            global_batch_size=args.batch_size * hvd.size())

    # Validation mode
    infer_net = None
    val_iterator = None
    if args.mode in ['infer', 'val', 'train_val']:
        # Validation iterator
        tfrecord_files = glob.glob(
            os.path.join(args.tfrecord_root, 'val.*.tfrecord'))
        index_files = glob.glob(os.path.join(args.tfrecord_root, 'val.*.idx'))
        tfrecords = [(tfrecod, index)
                     for tfrecod, index in zip(tfrecord_files, index_files)]
        val_pipeline = get_inference_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.eval_batch_size,
            dataset_size=args.eval_dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16')
        log_event(key=mlperf_constants.EVAL_SAMPLES,
                  value=val_pipeline.epoch_size)

        # Inference network
        infer_net = COCOInference(net=net,
                                  ltrb=False,
                                  scale_bboxes=True,
                                  score_threshold=0.0)

        # annotations file
        cocoapi_annotation_file = os.path.join(
            args.coco_root, 'annotations', 'bbox_only_instances_val2017.json')

    # Prepare model
    model = SSDModel(net=net,
                     anchors_np=anchors_np,
                     precision=args.precision,
                     fp16_loss_scale=args.fp16_loss_scale,
                     train_net=train_net,
                     trainer_fn=trainer_fn,
                     lr_scheduler=lr_scheduler,
                     metric=mx.metric.Loss(),
                     infer_net=infer_net,
                     async_executor=async_executor,
                     save_prefix=save_prefix,
                     ctx=mx.gpu(hvd.local_rank()))

    # Do a training and validation runs on fake data.
    # this will set layers shape (needed before loading pre-trained backbone),
    # allocate tensors and and cache optimized graph.
    # Training dry run:
    logging.info('Running training dry runs')
    dummy_train_pipeline = get_training_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        anchors=anchors_np,
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.batch_size * args.input_batch_multiplier,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16',
        seed=args.seed)
    dummy_train_iterator = get_training_iterator(pipeline=dummy_train_pipeline,
                                                 batch_size=args.batch_size)
    for images, box_targets, cls_targets in dummy_train_iterator:
        model.train_step(images=images,
                         box_targets=box_targets,
                         cls_targets=cls_targets)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_train_pipeline
    # del dummy_train_iterator
    mx.ndarray.waitall()
    logging.info('Done')
    # Validation dry run:
    logging.info('Running inference dry runs')
    dummy_val_pipeline = get_inference_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.eval_batch_size,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16')
    dummy_val_iterator = get_inference_iterator(pipeline=dummy_val_pipeline)
    model.infer(data_iterator=dummy_val_iterator, log_interval=None)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_val_pipeline
    # del dummy_val_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # re-initialize the model as a precaution in case the dry runs changed the parameters
    model.init_model(force_reinit=True)
    model.zero_grads()
    mx.ndarray.waitall()

    # load saved model or pretrained backbone
    if args.resume_from:
        model.load_parameters(filename=args.resume_from)
    elif args.pretrained_backbone:
        model.load_pretrain_backbone(picklefile_name=args.pretrained_backbone)

    # broadcast parameters
    model.broadcast_params()
    mx.ndarray.waitall()

    if args.test_initialization and hvd.rank() == 0:
        model.print_params_stats(net)

    log_end(key=mlperf_constants.INIT_STOP)

    # Main MLPerf loop (training+validation)
    mpiwrapper.barrier()
    log_start(key=mlperf_constants.RUN_START)
    mpiwrapper.barrier()
    # Real data iterators
    train_iterator = None
    val_iterator = None
    if train_pipeline:
        train_iterator = get_training_iterator(pipeline=train_pipeline,
                                               batch_size=args.batch_size,
                                               synthetic=args.synthetic)
    if val_pipeline:
        val_iterator = get_inference_iterator(pipeline=val_pipeline)
    model_map, epoch = model.train_val(train_iterator=train_iterator,
                                       start_epoch=args.start_epoch,
                                       end_epoch=args.epochs,
                                       val_iterator=val_iterator,
                                       val_interval=args.val_interval,
                                       val_epochs=args.val_epochs,
                                       annotation_file=cocoapi_annotation_file,
                                       target_map=args.target_map,
                                       train_log_interval=args.log_interval,
                                       val_log_interval=args.log_interval,
                                       save_interval=args.save_interval,
                                       cocoapi_threads=args.cocoapi_threads,
                                       profile_start=args.profile_start,
                                       profile_stop=args.profile_stop)
    status = 'success' if (model_map
                           and model_map >= args.target_map) else 'aborted'
    mx.ndarray.waitall()
    log_end(key=mlperf_constants.RUN_STOP, metadata={"status": status})

    logging.info(f'Rank {hvd.rank()} done. map={model_map} @ epoch={epoch}')
    mx.nd.waitall()
    hvd.shutdown()
Exemple #11
0
def train_text_classification(args, reporter=None):
    # Step 1: add scripts every function and python objects in the original training script except for the training function
    # at the beginning of the decorated function
    nlp = try_import_gluonnlp()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.INFO)
        logger.info(args)
    batch_size = args.batch_size
    dev_batch_size = args.dev_batch_size
    lr = args.lr
    epsilon = args.epsilon
    accumulate = args.accumulate
    log_interval = args.log_interval * accumulate if accumulate else args.log_interval
    if accumulate:
        logger.info('Using gradient accumulation. Effective batch size = ' \
                     'batch_size * accumulate = %d', accumulate * batch_size)

    # random seed
    np.random.seed(args.seed)
    random.seed(args.seed)
    mx.random.seed(args.seed)

    # TODO support for multi-GPU
    ctx = [mx.gpu(i) for i in range(args.num_gpus)
           ][0] if args.num_gpus > 0 else [mx.cpu()][0]

    task = args.dataset
    # data type with mixed precision training
    if args.dtype == 'float16':
        try:
            from mxnet.contrib import amp  # pylint: disable=ungrouped-imports
            # monkey patch amp list since topk does not support fp16
            amp.lists.symbol.FP32_FUNCS.append('topk')
            amp.lists.symbol.FP16_FP32_FUNCS.remove('topk')
            amp.init()
        except ValueError:
            # topk is already in the FP32_FUNCS list
            amp.init()
        except ImportError:
            # amp is not available
            logger.info(
                'Mixed precision training with float16 requires MXNet >= '
                '1.5.0b20190627. Please consider upgrading your MXNet version.'
            )
            exit()

    # model and loss
    model_name = args.net
    dataset = args.pretrained_dataset

    use_roberta = 'roberta' in model_name
    get_model_params = {
        'name': model_name,
        'dataset_name': dataset,
        'pretrained': True,
        'ctx': ctx,
        'use_decoder': False,
        'use_classifier': False,
    }
    # RoBERTa does not contain parameters for sentence pair classification
    if not use_roberta:
        get_model_params['use_pooler'] = True

    bert, vocabulary = nlp.model.get_model(**get_model_params)
    model = get_network(bert, task.class_labels, use_roberta)
    #do_regression = not task.class_labels
    #if do_regression:
    #    num_classes = 1
    #    loss_function = gluon.loss.L2Loss()
    #else:
    #    num_classes = len(task.class_labels)
    #    loss_function = gluon.loss.SoftmaxCELoss()
    ## reuse the BERTClassifier class with num_classes=1 for regression
    #if use_roberta:
    #    model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
    #else:
    #    model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)
    # initialize classifier
    loss_function = gluon.loss.SoftmaxCELoss(
    ) if task.class_labels else gluon.loss.L2Loss()
    initializer = mx.init.Normal(0.02)
    model.classifier.initialize(init=initializer, ctx=ctx)

    model.hybridize(static_alloc=True)
    loss_function.hybridize(static_alloc=True)

    # data processing
    do_lower_case = 'uncased' in dataset
    if use_roberta:
        bert_tokenizer = nlp.data.GPT2BPETokenizer()
    else:
        bert_tokenizer = nlp.data.BERTTokenizer(vocabulary,
                                                lower=do_lower_case)

    # Get the loader.
    train_data, dev_data_list, num_train_examples, trans, test_trans = preprocess_data(
        bert_tokenizer, task, batch_size, dev_batch_size, args.max_len,
        vocabulary, True, args.num_workers)

    def log_train(batch_id, batch_num, metric, step_loss, log_interval,
                  epoch_id, learning_rate, tbar):
        """Generate and print out the log message for training. """
        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]

        train_str = '[Epoch %d] loss=%.4f, lr=%.7f, metrics:' + \
                    ','.join([i + ':%.4f' for i in metric_nm])
        tbar.set_description(
            train_str %
            (epoch_id, step_loss / log_interval, learning_rate, *metric_val))

    def log_eval(batch_id, batch_num, metric, step_loss, log_interval, tbar):
        """Generate and print out the log message for inference. """
        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]

        eval_str = 'loss=%.4f, metrics:' + \
                   ','.join([i + ':%.4f' for i in metric_nm])
        tbar.set_description(eval_str %
                             (step_loss / log_interval, *metric_val))

    def evaluate(loader_dev, metric, segment):
        """Evaluate the model on validation dataset."""
        metric.reset()
        step_loss = 0
        tbar = tqdm(loader_dev)
        for batch_id, seqs in enumerate(tbar):
            input_ids, valid_length, segment_ids, label = seqs
            input_ids = input_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx).astype('float32')
            label = label.as_in_context(ctx)
            if use_roberta:
                out = model(input_ids, valid_length)
            else:
                out = model(input_ids, segment_ids.as_in_context(ctx),
                            valid_length)
            ls = loss_function(out, label).mean()

            step_loss += ls.asscalar()
            metric.update([label], [out])

            if (batch_id + 1) % (args.log_interval) == 0:
                log_eval(batch_id, len(loader_dev), metric, step_loss,
                         args.log_interval, tbar)
                step_loss = 0

        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]
        metric_str = 'validation metrics:' + ','.join(
            [i + ':%.4f' for i in metric_nm])
        logger.info(metric_str, *metric_val)

        mx.nd.waitall()
        return metric_nm, metric_val

    # Step 2: the training function in the original training script is added in the decorated function in autogluon for training.
    """Training function."""

    all_model_params = model.collect_params()
    optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
    trainer = gluon.Trainer(all_model_params,
                            'bertadam',
                            optimizer_params,
                            update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    step_num = 0

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in all_model_params.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'
    # track best eval score
    metric_history = []
    best_metric = None
    patience = args.early_stop

    tic = time.time()
    for epoch_id in range(args.epochs):
        if args.early_stop and patience == 0:
            logger.info('Early stopping at epoch %d', epoch_id)
            break
        task.metric.reset()
        step_loss = 0
        tic = time.time()
        all_model_params.zero_grad()

        tbar = tqdm(train_data)
        for batch_id, seqs in enumerate(tbar):
            # learning rate schedule
            if step_num < num_warmup_steps:
                new_lr = lr * step_num / num_warmup_steps
            else:
                non_warmup_steps = step_num - num_warmup_steps
                offset = non_warmup_steps / (num_train_steps -
                                             num_warmup_steps)
                new_lr = lr - offset * lr
            trainer.set_learning_rate(new_lr)

            # forward and backward
            with mx.autograd.record():
                input_ids, valid_length, segment_ids, label = seqs
                input_ids = input_ids.as_in_context(ctx)
                valid_length = valid_length.as_in_context(ctx).astype(
                    'float32')
                label = label.as_in_context(ctx)
                if use_roberta:
                    out = model(input_ids, valid_length)
                else:
                    out = model(input_ids, segment_ids.as_in_context(ctx),
                                valid_length)
                ls = loss_function(out, label).mean()
                if args.dtype == 'float16':
                    with amp.scale_loss(ls, trainer) as scaled_loss:
                        mx.autograd.backward(scaled_loss)
                else:
                    ls.backward()

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(accumulate if accumulate else 1)
                step_num += 1
                if accumulate and accumulate > 1:
                    # set grad to zero for gradient accumulation
                    all_model_params.zero_grad()

            step_loss += ls.asscalar()
            task.metric.update([label], [out])
            if (batch_id + 1) % (args.log_interval) == 0:
                log_train(batch_id, len(train_data), task.metric, step_loss,
                          args.log_interval, epoch_id, trainer.learning_rate,
                          tbar)
                step_loss = 0
        mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:
            metric_nm, metric_val = evaluate(dev_data, task.metric, segment)
            if best_metric is None or metric_val >= best_metric:
                best_metric = metric_val
                patience = args.early_stop
            else:
                if args.early_stop is not None:
                    patience -= 1
            metric_history.append((epoch_id, metric_nm, metric_val))

        if reporter is not None:
            # Note: epoch reported back must start with 1, not with 0
            reporter(epoch=epoch_id + 1, accuracy=metric_val[0])

    if args.final_fit:
        get_model_params.pop('ctx')
        return {
            'model_params': collect_params(model),
            'get_model_args': get_model_params,
            'class_labels': task.class_labels,
            'transform': trans,
            'test_transform': test_trans
        }
Exemple #12
0
def fit(args, model, data_loader):
    """
    train a model
    args : argparse returns
    model : the the neural network model
    data_loader : function that returns the train and val data iterators
    """

    start_time = time.time()

    report = Report(args.arch, len(args.gpus), sys.argv)

    # select gpu for horovod process
    if 'horovod' in args.kv_store:
        hvd.init()
        args.gpus = [args.gpus[hvd.local_rank()]]

    if args.amp:
        amp.init()

    if args.seed is not None:
        logging.info('Setting seeds to {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
        mx.random.seed(args.seed)

    # kvstore
    if 'horovod' in args.kv_store:
        kv = None
        rank = hvd.rank()
        num_workers = hvd.size()
    else:
        kv = mx.kvstore.create(args.kv_store)
        rank = kv.rank
        num_workers = kv.num_workers

    if args.test_io:
        train, val = data_loader(args, kv)

        if args.test_io_mode == 'train':
            data_iter = train
        else:
            data_iter = val

        tic = time.time()
        for i, batch in enumerate(data_iter):
            if isinstance(batch, list):
                for b in batch:
                    for j in b.data:
                        j.wait_to_read()
            else:
                for j in batch.data:
                    j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info('Batch [{}]\tSpeed: {:.2f} samples/sec'.format(
                    i, args.disp_batches * args.batch_size / (time.time() - tic)))
                tic = time.time()
        return

    if not load_model(args, model):
        # all initializers should be specified in the model definition.
        # if not, this will raise an error
        model.initialize(mx.init.Initializer())

    # devices for training
    devs = list(map(mx.gpu, args.gpus))
    model.collect_params().reset_ctx(devs)

    if args.mode == 'pred':
        logging.info('Infering image {}'.format(args.data_pred))
        model_pred(args, model, data.load_image(args, args.data_pred, devs[0]))
        return

    # learning rate
    lr_scheduler = get_lr_scheduler(args)

    optimizer_params = {
        'learning_rate': 0,
        'wd': args.wd,
        'multi_precision': True,
    }

    # Only a limited number of optimizers have 'momentum' property
    has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
    if args.optimizer in has_momentum:
        optimizer_params['momentum'] = args.mom

    # evaluation metrices
    if not args.no_metrics:
        eval_metrics = ['accuracy']
        eval_metrics.append(mx.metric.create(
            'top_k_accuracy', top_k=5))
    else:
        eval_metrics = []

    train, val = data_loader(args, kv)
    train = BenchmarkingDataIter(train, args.benchmark_iters)
    if val is not None:
        val = BenchmarkingDataIter(val, args.benchmark_iters)

    if 'horovod' in args.kv_store:
        # Fetch and broadcast parameters
        params = model.collect_params()
        if params is not None:
            hvd.broadcast_parameters(params, root_rank=0)

    # run
    if args.mode in ['train_val', 'train']:
        model_fit(
            args,
            model,
            train,
            begin_epoch=args.begin_epoch,
            num_epoch=args.num_epochs,
            eval_data=val,
            eval_metric=eval_metrics,
            kvstore=args.kv_store,
            kv=kv,
            optimizer=args.optimizer,
            optimizer_params=optimizer_params,
            lr_scheduler=lr_scheduler,
            report=report,
            model_prefix=args.model_prefix,
            print_loss=not args.no_metrics,
        )
    elif args.mode == 'val':
        for epoch in range(args.num_epochs):  # loop for benchmarking
            score = model_score(args, model, val, eval_metrics, args.kv_store, report=report)
            for name, value in zip(*score):
                logging.info('Validation {:20}: {}'.format(name, value))
    else:
        raise ValueError('Wrong mode')

    mx.nd.waitall()

    report.set_total_duration(time.time() - start_time)
    if args.report:
        suffix = '-{}'.format(hvd.rank()) if 'horovod' in args.kv_store and hvd.rank() != 0 else ''
        report.save(args.report + suffix)

    logging.info('Experiment took: {} sec'.format(report.total_duration))
Exemple #13
0
def train_net(config):
    mx.random.seed(3)
    np.random.seed(3)

    if config.TRAIN.USE_FP16:
        from mxnet.contrib import amp
        amp.init()
    if config.use_hvd:
        import horovod.mxnet as hvd

    ctx_list = [mx.gpu(x) for x in config.gpus]
    from utils.blocks import FrozenBatchNorm2d
    neck = PyramidNeckFCOS(feature_dim=config.network.fpn_neck_feature_dim)
    backbone = build_backbone(config,
                              neck=neck,
                              norm_layer=FrozenBatchNorm2d,
                              **config.network.BACKBONE.kwargs)
    net = FCOSFPNNet(backbone, config.dataset.NUM_CLASSES)

    # Resume parameters.
    resume = None
    if resume is not None:
        params_coco = mx.nd.load(resume)
        for k in params_coco:
            params_coco[k.replace("arg:", "").replace("aux:",
                                                      "")] = params_coco.pop(k)
        params = net.collect_params()

        for k in params.keys():
            try:
                params[k]._load_init(params_coco[k.replace('resnet0_', '')],
                                     ctx=mx.cpu())
                print("success load {}".format(k))
            except Exception as e:
                logging.exception(e)

    if config.TRAIN.resume is not None:
        net.collect_params().load(config.TRAIN.resume)
        logging.info("loaded resume from {}".format(config.TRAIN.resume))

    # Initialize parameters
    params = net.collect_params()
    from utils.initializer import KaMingUniform
    for key in params.keys():
        if params[key]._data is None:
            default_init = mx.init.Zero(
            ) if "bias" in key or "offset" in key else KaMingUniform()
            default_init.set_verbosity(True)
            if params[key].init is not None and hasattr(
                    params[key].init, "set_verbosity"):
                params[key].init.set_verbosity(True)
                params[key].initialize(init=params[key].init,
                                       default_init=params[key].init)
            else:
                params[key].initialize(default_init=default_init)
    params = net.collect_params()
    # for p_name, p in params.items():
    #     if p_name.endswith(('_bias')):
    #         p.wd_mult = 0
    #         p.lr_mult = 2
    #         logging.info("set wd_mult of {} to {}.".format(p_name, p.wd_mult))
    #         logging.info("set lr_mult of {} to {}.".format(p_name, p.lr_mult))

    net.collect_params().reset_ctx(list(set(ctx_list)))

    if config.dataset.dataset_type == "coco":
        from data.bbox.mscoco import COCODetection
        base_train_dataset = COCODetection(root=config.dataset.dataset_path,
                                           splits=("instances_train2017", ),
                                           h_flip=config.TRAIN.FLIP,
                                           transform=None,
                                           use_crowd=False)
    elif config.dataset.dataset_type == "voc":
        from data.bbox.voc import VOCDetection
        base_train_dataset = VOCDetection(root=config.dataset.dataset_path,
                                          splits=((2007, 'trainval'),
                                                  (2012, 'trainval')),
                                          preload_label=False)
    else:
        assert False
    train_dataset = AspectGroupingDataset(
        base_train_dataset,
        config,
        target_generator=FCOSTargetGenerator(config))

    if config.use_hvd:

        class SplitDataset(object):
            def __init__(self, da, local_size, local_rank):
                self.da = da
                self.local_size = local_size
                self.locak_rank = local_rank

            def __len__(self):
                return len(self.da) // self.local_size

            def __getitem__(self, idx):
                return self.da[idx * self.local_size + self.locak_rank]

        train_dataset = SplitDataset(train_dataset,
                                     local_size=hvd.local_size(),
                                     local_rank=hvd.local_rank())

    train_loader = mx.gluon.data.DataLoader(dataset=train_dataset,
                                            batch_size=1,
                                            num_workers=8,
                                            last_batch="discard",
                                            shuffle=True,
                                            thread_pool=False,
                                            batchify_fn=batch_fn)

    params_all = net.collect_params()
    params_to_train = {}
    params_fixed_prefix = config.network.FIXED_PARAMS
    for p in params_all.keys():
        ignore = False
        if params_all[p].grad_req == "null" and "running" not in p:
            ignore = True
            logging.info(
                "ignore {} because its grad req is set to null.".format(p))
        if params_fixed_prefix is not None:
            import re
            for f in params_fixed_prefix:
                if re.match(f, str(p)) is not None:
                    ignore = True
                    params_all[p].grad_req = 'null'
                    logging.info(
                        "{} is ignored when training because it matches {}.".
                        format(p, f))
        if not ignore and params_all[p].grad_req != "null":
            params_to_train[p] = params_all[p]
    lr_steps = [len(train_loader) * int(x) for x in config.TRAIN.lr_step]
    logging.info(lr_steps)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(
        step=lr_steps,
        warmup_mode="constant",
        factor=.1,
        base_lr=config.TRAIN.lr,
        warmup_steps=config.TRAIN.warmup_step,
        warmup_begin_lr=config.TRAIN.warmup_lr)
    if config.use_hvd:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            params_to_train, 'sgd', {
                'wd': config.TRAIN.wd,
                'momentum': config.TRAIN.momentum,
                'clip_gradient': None,
                'lr_scheduler': lr_scheduler,
                'multi_precision': True,
            })
    else:
        trainer = mx.gluon.Trainer(
            params_to_train,  # fix batchnorm, fix first stage, etc...
            'sgd',
            {
                'wd': config.TRAIN.wd,
                'momentum': config.TRAIN.momentum,
                'clip_gradient': None,
                'lr_scheduler': lr_scheduler,
                'multi_precision': True,
            },
            update_on_kvstore=(False if config.TRAIN.USE_FP16 else None),
            kvstore=mx.kvstore.create('local'))
    if config.TRAIN.USE_FP16:
        amp.init_trainer(trainer)
    # trainer = mx.gluon.Trainer(
    #     params_to_train,  # fix batchnorm, fix first stage, etc...
    #     'adam', {"learning_rate": 4e-4})
    # Please note that the GPU devices of the trainer states when saving must be same with that when loading.
    if config.TRAIN.trainer_resume is not None:
        trainer.load_states(config.TRAIN.trainer_resume)
        logging.info("loaded trainer states from {}.".format(
            config.TRAIN.trainer_resume))

    metric_loss_loc = mx.metric.Loss(name="loss_loc")
    metric_loss_cls = mx.metric.Loss(name="loss_cls")
    metric_loss_center = mx.metric.Loss(name="loss_center")
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [metric_loss_loc, metric_loss_cls, metric_loss_center]:
        eval_metrics.add(child_metric)

    net.hybridize(static_alloc=True, static_shape=False)
    for ctx in ctx_list:
        with ag.record():
            pad = lambda x: int(np.ceil(x / 32) * 32)
            _ = net(
                mx.nd.random.randn(
                    config.TRAIN.batch_size // len(ctx_list),
                    int(pad(config.TRAIN.image_max_long_size + 32)),
                    int(pad(config.TRAIN.image_short_size + 32)),
                    3,
                    ctx=ctx))
        ag.backward(_)
        del _
        net.collect_params().zero_grad()
    mx.nd.waitall()

    while trainer.optimizer.num_update <= config.TRAIN.end_epoch * len(
            train_loader):
        epoch = trainer.optimizer.num_update // len(train_loader)
        for data_batch in tqdm.tqdm(
                train_loader
        ) if not config.use_hvd or hvd.local_rank() == 0 else train_loader:
            if config.use_hvd:
                data_list = [data_batch[0].as_in_context(ctx_list[0])]
                targets_list = [data_batch[1].as_in_context(ctx_list[0])]
            else:
                if isinstance(data_batch[0], mx.nd.NDArray):
                    data_list = mx.gluon.utils.split_and_load(
                        mx.nd.array(data_batch[0]),
                        ctx_list=ctx_list,
                        batch_axis=0)
                    targets_list = mx.gluon.utils.split_and_load(
                        mx.nd.array(data_batch[1]),
                        ctx_list=ctx_list,
                        batch_axis=0)
                else:
                    data_list = mx.gluon.utils.split_and_load(
                        mx.nd.array(data_batch[0][0]),
                        ctx_list=ctx_list,
                        batch_axis=0)
                    targets_list = mx.gluon.utils.split_and_load(
                        mx.nd.array(data_batch[0][1]),
                        ctx_list=ctx_list,
                        batch_axis=0)

            losses_loc = []
            losses_center_ness = []
            losses_cls = []

            n_workers = hvd.local_size() if config.use_hvd else len(ctx_list)
            num_pos = data_batch[0][1][:, 0].sum() / n_workers
            num_pos_denominator = mx.nd.maximum(num_pos,
                                                mx.nd.ones_like(num_pos))
            centerness_sum = data_batch[0][1][:, 5].sum() / n_workers
            centerness_sum_denominator = mx.nd.maximum(
                centerness_sum, mx.nd.ones_like(centerness_sum))

            with ag.record():
                for data, targets in zip(data_list, targets_list):
                    num_pos_denominator_ctx = num_pos_denominator.as_in_context(
                        data.context)
                    centerness_sum_denominator_ctx = centerness_sum_denominator.as_in_context(
                        data.context)
                    loc_preds, cls_preds = net(data)
                    iou_loss = mobula.op.IoULoss(loc_preds[:, :4],
                                                 targets[:, 1:5],
                                                 axis=1)
                    iou_loss = iou_loss * targets[:, 5:
                                                  6] / centerness_sum_denominator_ctx
                    # iou_loss = IoULoss()(loc_preds[:, :4].exp(), targets[:, 1:5]) * targets[:, 5] / centerness_sum_denominator_ctx
                    loss_center = mobula.op.BCELoss(
                        loc_preds[:, 4],
                        targets[:, 5]) * targets[:,
                                                 0] / num_pos_denominator_ctx
                    loss_cls = mobula.op.FocalLoss(
                        alpha=.25,
                        gamma=2,
                        logits=cls_preds,
                        targets=targets[:, 6:]) / num_pos_denominator_ctx
                    loss_total = loss_center.sum() + iou_loss.sum(
                    ) + loss_cls.sum()
                    if config.TRAIN.USE_FP16:
                        with amp.scale_loss(loss_total,
                                            trainer) as scaled_losses:
                            ag.backward(scaled_losses)
                    else:
                        loss_total.backward()
                    losses_loc.append(iou_loss)
                    losses_center_ness.append(loss_center)
                    losses_cls.append(loss_cls)

            trainer.step(n_workers)
            if not config.use_hvd or hvd.local_rank() == 0:
                for l in losses_loc:
                    metric_loss_loc.update(None, l.sum())
                for l in losses_center_ness:
                    metric_loss_center.update(None, l.sum())
                for l in losses_cls:
                    metric_loss_cls.update(None, l.sum())
                if trainer.optimizer.num_update % config.TRAIN.log_interval == 0:  #
                    msg = "Epoch={},Step={},lr={}, ".format(
                        epoch, trainer.optimizer.num_update,
                        trainer.learning_rate)
                    msg += ','.join([
                        '{}={:.3f}'.format(w, v)
                        for w, v in zip(*eval_metrics.get())
                    ])
                    logging.info(msg)
                    eval_metrics.reset()
                if trainer.optimizer.num_update % 5000 == 0:
                    save_path = os.path.join(
                        config.TRAIN.log_path,
                        "{}-{}.params".format(epoch,
                                              trainer.optimizer.num_update))
                    net.collect_params().save(save_path)
                    logging.info("Saved checkpoint to {}".format(save_path))
                    trainer_path = save_path + "-trainer.states"
                    trainer.save_states(trainer_path)

        if not config.use_hvd or hvd.local_rank() == 0:
            save_path = os.path.join(config.TRAIN.log_path,
                                     "{}.params".format(epoch))
            net.collect_params().save(save_path)
            logging.info("Saved checkpoint to {}".format(save_path))
            trainer_path = save_path + "-trainer.states"
            trainer.save_states(trainer_path)
Exemple #14
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP 가 모든 연산을 지원하지는 않는다.
    modulated convolution을 지원하지 않음
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # 고정
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)

    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0001,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0001,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            topk=topk,
                            scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()
        '''
        target generator를 train_dataloader에서 만들어 버리는게 학습 속도가 훨씬 빠르다. 
        '''

        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image,
                                      num_outputs=subdivision,
                                      axis=0)
            heatmap_split = mx.nd.split(data=heatmap,
                                        num_outputs=subdivision,
                                        axis=0)
            offset_target_split = mx.nd.split(data=offset_target,
                                              num_outputs=subdivision,
                                              axis=0)
            wh_target_split = mx.nd.split(data=wh_target,
                                          num_outputs=subdivision,
                                          axis=0)
            mask_target_split = mx.nd.split(data=mask_target,
                                            num_outputs=subdivision,
                                            axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(
                            image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(
                            image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(
                            heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기

            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # loss 구하기
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, ctx, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)

                    id, score, bbox = prediction(heatmap_pred, offset_pred,
                                                 wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred,
                                                    heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred,
                                                     wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                            bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # heatmap 그리기
                        heatmap = mx.nd.multiply(heatmap,
                                                 255.0)  # 0 ~ 255 범위로 바꾸기
                        heatmap = mx.nd.max(
                            heatmap, axis=0,
                            keepdims=True)  # channel 축으로 가장 큰것 뽑기
                        heatmap = mx.nd.transpose(
                            heatmap,
                            axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(
                            heatmap, repeats=3,
                            axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy(
                        )  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap,
                                             dsize=(input_size[1],
                                                    input_size[0]))  # 사이즈 원복
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :,
                                (0, 1, 2)] = heatmap[:, :,
                                                     (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(
                            heatmap,
                            axes=(2, 0, 1))  # (channel=3, height, width)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate(
                    [np.array(batch_image),
                     np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result",
                                  image=all_image,
                                  global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean":
                                       train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean":
                                       valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean":
                                       train_offset_loss_mean,
                                       "valid_offset_loss_mean":
                                       valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean":
                                       train_wh_loss_mean,
                                       "valid_wh_loss_mean": valid_wh_loss_mean
                                   },
                                   global_step=i)

                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # 새로운 객체가 생성
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Exemple #15
0
def train(opts):
    logging.debug(f'Initializing from {opts.cfg}')
    with open(opts.cfg) as fp:
        cfg = yaml.load(fp, yaml.SafeLoader)
        if opts.wandb:
            wandb.config.update(cfg)
        cfg = postprocess(cfg)
        logging.debug(yaml.dump(cfg, default_flow_style=False))
    trainer_cfg = cfg.pop('trainer')
    if trainer_cfg.get('amp', False):
        amp.init()

    net = build_detector(cfg.pop('detector'))
    # net.initialize(ctx=trainer_cfg['ctx'])
    net.initialize(ctx=trainer_cfg['ctx'], init=mx.init.Xavier(magnitude=2.5))
    loss_fn = build_loss(cfg.pop('loss'))
    optimizer = estimator.build_optimizer(cfg.pop('optimizer'), net)
    val_net = create_val_net(net) if trainer_cfg.get('hybridize',
                                                     False) else net

    # save_net_plot(net, opts.vizfile, format='png')
    print_summary(net)

    data_cfg = cfg.pop('dataset')
    # batchify = Tuple([Stack(), Pad(axis=0, pad_val=-1)])
    batchify = Tuple(*([Stack() for _ in range(7)] +
                       [Pad(axis=0, pad_val=-1) for _ in range(1)]))
    train_dataset = build_dataset(data_cfg.pop('train'))
    train_dataset = train_dataset.transform(
        build_transformers(data_cfg.pop('train_transform')))
    val_dataset, val_metric = build_dataset(data_cfg.pop('test'))
    val_dataset = val_dataset.transform(
        build_transformers(data_cfg.pop('test_transform')))

    train_dataloader = DataLoader(train_dataset,
                                  trainer_cfg['batch_size'],
                                  shuffle=True,
                                  last_batch="rollover",
                                  batchify_fn=batchify,
                                  num_workers=trainer_cfg['workers'],
                                  pin_memory=True,
                                  timeout=60 * 60,
                                  prefetch=trainer_cfg['batch_size'] * 3,
                                  thread_pool=False)
    val_dataloader = DataLoader(val_dataset,
                                trainer_cfg['batch_size'],
                                shuffle=False,
                                last_batch='keep',
                                batchify_fn=Tuple(Stack(),
                                                  Pad(axis=0, pad_val=-1)),
                                num_workers=trainer_cfg['workers'],
                                pin_memory=True,
                                timeout=60 * 60,
                                thread_pool=False)
    train_metrics = estimator.build_metric(trainer_cfg.pop('train_metrics'))
    test_metrics = [estimator.metrics.DetectionAPMetric(val_metric)]

    processor = estimator.BatchIterProcessor(
        enable_hybridize=trainer_cfg.get('hybridize', False))

    trainer = Estimator(net,
                        loss_fn,
                        val_net=val_net,
                        train_metrics=train_metrics,
                        val_metrics=test_metrics,
                        trainer=optimizer,
                        context=trainer_cfg['ctx'],
                        batch_processor=processor)

    # initializing handlers
    checkpointer = CheckpointHandler(opts.save_dir,
                                     model_prefix=opts.name,
                                     monitor=test_metrics[0],
                                     verbose=1,
                                     save_best=True,
                                     mode='max',
                                     epoch_period=trainer_cfg['save_interval'],
                                     max_checkpoints=trainer_cfg['max_save'],
                                     resume_from_checkpoint=True)
    exporter = estimator.ExportBestSymbolModelHandler(
        checkpointer=checkpointer)
    # noinspection PyTypeChecker
    train_handlers = [
        checkpointer,
        exporter,
        processor,
        estimator.EmptyContextCacheHandler(),
        # estimator.StoppingOnNanHandler(),
        ValidationHandler(val_dataloader,
                          eval_fn=trainer.evaluate,
                          epoch_period=trainer_cfg['val_interval'],
                          event_handlers=processor),
        LoggingHandler(log_interval=trainer_cfg['log_interval'],
                       log_to_wandb=True,
                       metrics=train_metrics),
        estimator.GradientAccumulateUpdateHandler(trainer_cfg['accumulate']),
    ]

    # logging.warning(f'Initial validating...')
    # trainer.evaluate(val_dataloader)
    trainer.fit(
        train_dataloader,
        val_dataloader,
        event_handlers=train_handlers,
        epochs=trainer_cfg['epochs'],
        # batches=2
    )
Exemple #16
0
def fit(args, model, data_loader):
    """
    train a model
    args : argparse returns
    model : the the neural network model
    data_loader : function that returns the train and val data iterators
    """

    start_time = time.time()

    # select gpu for horovod process
    if 'horovod' in args.kv_store:
        args.gpus = [args.gpus[hvd.local_rank()]]

    if args.amp:
        amp.init()

    if args.seed is not None:
        logging.info('Setting seeds to {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
        mx.random.seed(args.seed)

    # kvstore
    if 'horovod' in args.kv_store:
        kv = None
        rank = hvd.rank()
        num_workers = hvd.size()
    else:
        kv = mx.kvstore.create(args.kv_store)
        rank = kv.rank
        num_workers = kv.num_workers

    if args.test_io:
        train, val = data_loader(args, kv)

        if args.test_io_mode == 'train':
            data_iter = train
        else:
            data_iter = val

        tic = time.time()
        for i, batch in enumerate(data_iter):
            if isinstance(batch, list):
                for b in batch:
                    for j in b.data:
                        j.wait_to_read()
            else:
                for j in batch.data:
                    j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info('Batch [{}]\tSpeed: {:.2f} samples/sec'.format(
                    i,
                    args.disp_batches * args.batch_size / (time.time() - tic)))
                tic = time.time()
        return

    if not load_model(args, model):
        # all initializers should be specified in the model definition.
        # if not, this will raise an error
        model.initialize(mx.init.Initializer())

    # devices for training
    devs = list(map(mx.gpu, args.gpus))
    model.collect_params().reset_ctx(devs)

    if args.mode == 'pred':
        logging.info('Infering image {}'.format(args.data_pred))
        model_pred(args, model, data.load_image(args, args.data_pred, devs[0]))
        return

    # learning rate
    lr_scheduler = get_lr_scheduler(args)

    optimizer_params = {
        'learning_rate': 0,
        'wd': args.wd,
        'multi_precision': True,
    }

    # Only a limited number of optimizers have 'momentum' property
    has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
    if args.optimizer in has_momentum:
        optimizer_params['momentum'] = args.mom

    # evaluation metrices
    if not args.no_metrics:
        eval_metrics = ['accuracy']
        eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=5))
    else:
        eval_metrics = []

    train, val = data_loader(args, kv)
    train = BenchmarkingDataIter(train, args.benchmark_iters)
    if val is not None:
        val = BenchmarkingDataIter(val, args.benchmark_iters)

    if 'horovod' in args.kv_store:
        # Fetch and broadcast parameters
        params = model.collect_params()
        if params is not None:
            hvd.broadcast_parameters(params, root_rank=0)
    global_metrics = CompositeMeter()
    if args.mode in ['train_val', 'train']:
        global_metrics.register_metric('train.loss', MinMeter())
        global_metrics.register_metric('train.ips', AvgMeter())

    if args.mode in ['train_val', 'val']:
        global_metrics.register_metric('val.accuracy', MaxMeter())
        global_metrics.register_metric('val.top_k_accuracy_5', MaxMeter())
        global_metrics.register_metric('val.ips', AvgMeter())
        global_metrics.register_metric('val.latency_avg', AvgMeter())

    if args.mode in ['val']:
        global_metrics.register_metric('val.latency_50', PercentileMeter(50))
        global_metrics.register_metric('val.latency_90', PercentileMeter(90))
        global_metrics.register_metric('val.latency_95', PercentileMeter(95))
        global_metrics.register_metric('val.latency_99', PercentileMeter(99))
        global_metrics.register_metric('val.latency_100', PercentileMeter(100))

    # run
    if args.mode in ['train_val', 'train']:
        model_fit(
            args,
            model,
            train,
            begin_epoch=args.begin_epoch,
            num_epoch=args.num_epochs,
            run_epoch=args.run_epochs,
            eval_data=val,
            eval_metric=eval_metrics,
            global_metrics=global_metrics,
            kvstore=args.kv_store,
            kv=kv,
            optimizer=args.optimizer,
            optimizer_params=optimizer_params,
            lr_scheduler=lr_scheduler,
            model_prefix=os.path.join(args.workspace, args.model_prefix),
        )
    elif args.mode == 'val':
        for epoch in range(args.num_epochs):  # loop for benchmarking
            score, duration_stats, durations = model_score(
                args, model, val, eval_metrics, args.kv_store)
            dllogger_data = dict(
                starmap(lambda key, val: ('val.{}'.format(key), val),
                        zip(*score)))
            dllogger_data.update(
                starmap(lambda key, val: ('val.{}'.format(key), val),
                        duration_stats.items()))
            global_metrics.update_dict(dllogger_data)
            for percentile in [50, 90, 95, 99, 100]:
                metric_name = 'val.latency_{}'.format(percentile)
                dllogger_data[metric_name] = np.percentile(
                    durations, percentile)
                global_metrics.update_metric(metric_name, durations)
            dllogger.log(step=(epoch, ), data=dllogger_data)
    else:
        raise ValueError('Wrong mode')

    mx.nd.waitall()
    dllogger.log(tuple(), data=global_metrics.get())
def main():
    opt = parse_args()

    filehandler = logging.FileHandler(opt.logging_file, mode='a+')
    # streamhandler = logging.StreamHandler()

    logger = logging.getLogger('ImageNet')
    logger.setLevel(level=logging.DEBUG)
    logger.addHandler(filehandler)
    # logger.addHandler(streamhandler)

    logger.info(opt)

    if opt.amp:
        amp.init()

    batch_size = opt.batch_size
    classes = 1000
    num_training_samples = 1281167
    num_validating_samples = 50000

    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers
    accumulate = opt.accumulate

    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // batch_size

    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=opt.lr,
                    nepochs=opt.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode,
                    base_lr=opt.lr,
                    target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay,
                    power=2)
    ])

    model_name = opt.model

    kwargs = {'ctx': context, 'pretrained': opt.use_pretrained}
    if opt.use_gn:
        kwargs['norm_layer'] = gcv.nn.GroupNorm
    if model_name.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    elif model_name.startswith('resnext'):
        kwargs['use_se'] = opt.use_se

    if opt.last_gamma:
        kwargs['last_gamma'] = True

    optimizer = 'sgd'
    optimizer_params = {
        'wd': opt.wd,
        'momentum': opt.momentum,
        'lr_scheduler': lr_scheduler,
        'begin_num_update': num_batches * opt.resume_epoch
    }
    # if opt.dtype != 'float32':
    #     optimizer_params['multi_precision'] = True

    # net = get_model(model_name, **kwargs)
    if opt.model_backend == 'gluoncv':
        net = glcv_get_model(model_name, **kwargs)
    elif opt.model_backend == 'gluoncv2':
        net = glcv2_get_model(model_name, **kwargs)
    else:
        raise ValueError(f'Unknown backend: {opt.model_backend}')
    # net.cast(opt.dtype)
    if opt.resume_params != '':
        net.load_parameters(opt.resume_params, ctx=context, cast_dtype=True)

    # teacher model for distillation training
    if opt.teacher is not None and opt.hard_weight < 1.0:
        teacher_name = opt.teacher
        if opt.teacher_backend == 'gluoncv':
            teacher = glcv_get_model(teacher_name, **kwargs)
        elif opt.teacher_backend == 'gluoncv2':
            teacher = glcv2_get_model(teacher_name, **kwargs)
        else:
            raise ValueError(f'Unknown backend: {opt.teacher_backend}')
        # teacher = glcv2_get_model(teacher_name, pretrained=True, ctx=context)
        # teacher.cast(opt.dtype)
        teacher.collect_params().setattr('grad_req', 'null')
        distillation = True
    else:
        distillation = False

    # Two functions for reading data from record file or raw images
    def get_data_rec(rec_train, rec_val):
        rec_train = os.path.expanduser(rec_train)
        rec_val = os.path.expanduser(rec_val)

        # mean_rgb = [123.68, 116.779, 103.939]
        # std_rgb = [58.393, 57.12, 57.375]

        train_dataset = ImageRecordDataset(filename=rec_train, flag=1)
        val_dataset = ImageRecordDataset(filename=rec_val, flag=1)
        return train_dataset, val_dataset

    def get_data_loader(data_dir):
        train_dataset = ImageNet(data_dir, train=True)
        val_dataset = ImageNet(data_dir, train=False)
        return train_dataset, val_dataset

    def batch_fn(batch, ctx):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1],
                                           ctx_list=ctx,
                                           batch_axis=0)
        return data, label

    if opt.use_rec:
        train_dataset, val_dataset = get_data_rec(opt.rec_train, opt.rec_val)
    else:
        train_dataset, val_dataset = get_data_loader(opt.data_dir)

    normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
    jitter_param = 0.4
    lighting_param = 0.1
    if not opt.multi_scale:
        train_dataset = train_dataset.transform_first(
            transforms.Compose([
                transforms.RandomResizedCrop(opt.input_size),
                transforms.RandomFlipLeftRight(),
                transforms.RandomColorJitter(brightness=jitter_param,
                                             contrast=jitter_param,
                                             saturation=jitter_param),
                transforms.RandomLighting(lighting_param),
                transforms.ToTensor(), normalize
            ]))
        train_data = gluon.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           pin_memory=True,
                                           last_batch='rollover',
                                           num_workers=num_workers)
    else:
        train_data = RandomTransformDataLoader(
            [
                Transform(
                    transforms.Compose([
                        # transforms.RandomResizedCrop(opt.input_size),
                        transforms.RandomResizedCrop(x * 32),
                        transforms.RandomFlipLeftRight(),
                        transforms.RandomColorJitter(brightness=jitter_param,
                                                     contrast=jitter_param,
                                                     saturation=jitter_param),
                        transforms.RandomLighting(lighting_param),
                        transforms.ToTensor(),
                        normalize
                    ])) for x in range(10, 20)
            ],
            train_dataset,
            interval=10 * opt.accumulate,
            batch_size=batch_size,
            shuffle=False,
            pin_memory=True,
            last_batch='rollover',
            num_workers=num_workers)
    val_dataset = val_dataset.transform_first(
        transforms.Compose([
            transforms.Resize(opt.input_size, keep_ratio=True),
            transforms.CenterCrop(opt.input_size),
            transforms.ToTensor(), normalize
        ]))
    val_data = gluon.data.DataLoader(val_dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     pin_memory=True,
                                     last_batch='keep',
                                     num_workers=num_workers)

    if opt.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    train_loss_metric = mx.metric.Loss()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        if opt.wandb:
            save_dir = wandb.run.dir
        else:
            save_dir = opt.save_dir
            makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0

    def mixup_transform(label, classes, lam=1, eta=0.0):
        if isinstance(label, nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes,
                           on_value=1 - eta + eta / classes,
                           off_value=eta / classes)
            y2 = l[::-1].one_hot(classes,
                                 on_value=1 - eta + eta / classes,
                                 off_value=eta / classes)
            res.append(lam * y1 + (1 - lam) * y2)
        return res

    def smooth(label, classes, eta=0.1):
        if isinstance(label, nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes,
                            on_value=1 - eta + eta / classes,
                            off_value=eta / classes)
            smoothed.append(res)
        return smoothed

    def test(ctx, val_data):
        acc_top1.reset()
        acc_top5.reset()
        for i, batch in tqdm.tqdm(enumerate(val_data),
                                  desc='Validating',
                                  total=num_validating_samples // batch_size):
            data, label = batch_fn(batch, ctx)
            # outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
            outputs = [net(X) for X in data]
            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        return 1 - top1, 1 - top5

    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            import warnings
            with warnings.catch_warnings(record=True) as w:
                net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if accumulate > 1:
            logger.info(f'accumulate: {accumulate}, using "add" grad_req')
            import warnings
            with warnings.catch_warnings(record=True) as w:
                net.collect_params().setattr('grad_req', 'add')

        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params,
                                update_on_kvstore=False if opt.amp else None)
        if opt.amp:
            amp.init_trainer(trainer)
        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(
                temperature=opt.temperature,
                hard_weight=opt.hard_weight,
                sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(
                sparse_label=sparse_label_loss)

        best_val_score = 1

        err_top1_val, err_top5_val = test(ctx, val_data)
        logger.info('initial validation: err-top1=%f err-top5=%f' %
                    (err_top1_val, err_top5_val))

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            train_loss_metric.reset()
            btic = time.time()
            pbar = tqdm.tqdm(total=num_batches,
                             desc=f'Training [{epoch}]',
                             leave=True)
            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                if opt.mixup:
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]

                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif opt.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)

                if distillation:
                    # teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                    #                 for X in data]
                    with ag.predict_mode():
                        teacher_prob = [
                            nd.softmax(
                                teacher(
                                    nd.transpose(
                                        nd.image.resize(
                                            nd.transpose(X, (0, 2, 3, 1)),
                                            size=opt.teacher_imgsize),
                                        (0, 3, 1, 2))) / opt.temperature)
                            for X in data
                        ]

                with ag.record():
                    # outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                    outputs = [net(X) for X in data]
                    if distillation:
                        # loss = [L(yhat.astype('float32', copy=False),
                        #           y.astype('float32', copy=False),
                        #           p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                        # print([outputs, label, teacher_prob])
                        loss = [
                            L(yhat, y, p)
                            for yhat, y, p in zip(outputs, label, teacher_prob)
                        ]
                    else:
                        # loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                        loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
                    if opt.amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)
                if accumulate > 1:
                    if (i + 1) % accumulate == 0:
                        trainer.step(batch_size * accumulate)
                        net.collect_params().zero_grad()
                else:
                    trainer.step(batch_size)

                train_loss_metric.update(0, loss)

                if opt.mixup:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                      for out in outputs]
                    train_metric.update(label, output_softmax)
                else:
                    if opt.label_smoothing:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                _, loss_score = train_loss_metric.get()
                train_metric_name, train_metric_score = train_metric.get()
                samplers_per_sec = batch_size / (time.time() - btic)
                postfix = f'{samplers_per_sec:.1f} imgs/sec, ' \
                          f'loss: {loss_score:.4f}, ' \
                          f'acc: {train_metric_score * 100:.2f}, ' \
                          f'lr: {trainer.learning_rate:.4e}'
                if opt.multi_scale:
                    postfix += f', size: {data[0].shape[-1]}'
                pbar.set_postfix_str(postfix)
                pbar.update()
                btic = time.time()
                if opt.log_interval and not (i + 1) % opt.log_interval:
                    step = epoch * num_batches + i
                    wandb.log(
                        {
                            'samplers_per_sec': samplers_per_sec,
                            train_metric_name: train_metric_score,
                            'lr': trainer.learning_rate,
                            'loss': loss_score
                        },
                        step=step)
                    logger.info(
                        'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'
                        % (epoch, i, samplers_per_sec, train_metric_name,
                           train_metric_score, trainer.learning_rate))

            pbar.close()
            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)
            wandb.log({
                'err1': err_top1_val,
                'err5': err_top5_val
            },
                      step=epoch * num_batches)

            logger.info('[Epoch %d] training: %s=%f' %
                        (epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' %
                        (epoch, err_top1_val, err_top5_val))

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters(
                    '%s/%.4f-imagenet-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))
                trainer.save_states(
                    '%s/%.4f-imagenet-%s-%d-best.states' %
                    (save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch +
                                                1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params' %
                                    (save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states' %
                                    (save_dir, model_name, epoch))

        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, model_name, opt.num_epochs - 1))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, model_name, opt.num_epochs - 1))

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=not opt.multi_scale)
        if distillation:
            teacher.hybridize(static_alloc=True,
                              static_shape=not opt.multi_scale)
    train(context)
Exemple #18
0
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)


if __name__ == '__main__':
    args = parse_args()

    if args.amp:
        amp.init()

    if args.horovod:
        if hvd is None:
            raise SystemExit(
                "Horovod not found, please check if you installed it correctly."
            )
        hvd.init()

    # fix seed for mxnet, numpy and python builtin random generator.
    gutils.random.seed(args.seed)

    # training contexts
    if args.horovod:
        ctx = [mx.gpu(hvd.local_rank())]
    else:
Exemple #19
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())
    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    model = str(input_size[0]) + "_" + str(
        input_size[1]) + "_" + optimizer + "_EFF_" + str(base)

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path,
                                  f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground만
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'
    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "beta1": 0.9,
                                    "beta2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "gamma1": 0.9,
                                    "gamma2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "momentum": 0.9,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # optimizer weight 불러오기
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")
    '''
    localization loss -> Smooth L1 loss 
    confidence loss -> Focal 
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split,
                                                             ctx_list,
                                                             even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split,
                                                           ctx_list,
                                                           even_split=False)
                    box_split = gluon.utils.split_and_load(box_split,
                                                           ctx_list,
                                                           even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # optimizer weight 저장하기
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)
            '''
                mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. 
                -yolo v3, gaussian yolo v3 에서는 문제가 발생한다.
                mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다.  

                block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님)
                export_block_for_cplusplus 에서 prediction 이 hybridize 되면서 
                미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다. 
                이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # loss 구하기
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Exemple #20
0
    def __init__(self,
                 network,
                 layers,
                 num_filters,
                 anchor_sizes,
                 anchor_ratios,
                 steps,
                 dataset,
                 input_shape,
                 batch_size,
                 optimizer,
                 lr,
                 wd,
                 momentum,
                 epoch,
                 lr_decay,
                 train_split='train2017',
                 val_split='val2017',
                 use_amp=False,
                 gpus='0,1,2,3',
                 save_prefix='~/gluon_detector/output'):
        self.network = network
        self.layers = layers
        self.num_filters = num_filters
        self.anchor_sizes = list(zip(anchor_sizes[:-1], anchor_sizes[1:]))
        self.anchor_ratios = anchor_ratios
        self.steps = steps

        self.dataset = dataset

        if isinstance(input_shape, int):
            self.input_size = input_size
            self.input_shape = (input_shape, input_shape)
        elif isinstance(input_shape, (tuple, list)):
            self.input_shape = input_shape
            self.input_size = input_shape[0]
        else:
            raise TypeError('Expected input_shape to be either int or tuple, \
                but got {}'.format(type(input_shape)))
        self.width, self.height = self.input_shape

        self.batch_size = batch_size
        self.train_split = train_split
        self.val_split = val_split
        self.optimizer = optimizer
        self.lr = lr
        self.wd = wd
        self.momentum = momentum
        self.epoch = epoch
        self.lr_decay = lr_decay
        self.lr_decay_epoch = ','.join([str(l * epoch) for l in [0.6, 0.8]])

        self.use_amp = use_amp

        self.ctx = [mx.gpu(int(i)) for i in gpus.split(',') if i.strip()]

        self.save_prefix = save_prefix

        self.anchors = self.get_anchors()
        self.net = self.build_net()

        self.train_data, self.val_data = self.get_dataloader()

        self.eval_metric = self.get_eval_metric()

        prefix = 'ssd_{}_{}_{}x{}'.format(self.dataset, self.network,
                                          self.input_shape[0],
                                          self.input_shape[1])
        self.save_prefix = os.path.expanduser(os.path.join(
            save_prefix, prefix))

        self.get_logger()

        if self.use_amp:
            amp.init()

        self.save_frequent = 10

        logging.info('SSDSolver initialized')
Exemple #21
0
    def __init__(self, config, logger=None, reporter=None):
        super(MaskRCNNEstimator, self).__init__(config, logger, reporter)

        # fix seed for mxnet, numpy and python builtin random generator.
        gutils.random.seed(self._cfg.train.seed)

        if self._cfg.mask_rcnn.amp:
            amp.init()

        # training contexts
        if self._cfg.horovod:
            self.ctx = [mx.gpu(hvd.local_rank())]
        else:
            ctx = [mx.gpu(int(i)) for i in self._cfg.gpus]
            self.ctx = ctx if ctx else [mx.cpu()]

        # network
        kwargs = {}
        module_list = []
        if self._cfg.mask_rcnn.use_fpn:
            module_list.append('fpn')
        if self._cfg.mask_rcnn.norm_layer is not None:
            module_list.append(self._cfg.mask_rcnn.norm_layer)
            if self._cfg.mask_rcnn.norm_layer == 'bn':
                kwargs['num_devices'] = len(self.ctx)
        self.num_gpus = hvd.size() if self._cfg.horovod else len(self.ctx)
        net_name = '_'.join(('mask_rcnn', *module_list,
                             self._cfg.mask_rcnn.backbone, self._cfg.dataset))
        if self._cfg.mask_rcnn.custom_model:
            self._cfg.mask_rcnn.use_fpn = True
            net_name = '_'.join(('mask_rcnn_fpn', self._cfg.mask_rcnn.backbone,
                                 self._cfg.dataset))
            if self._cfg.mask_rcnn.norm_layer == 'bn':
                norm_layer = gluon.contrib.nn.SyncBatchNorm
                norm_kwargs = {'num_devices': len(self.ctx)}
                # sym_norm_layer = mx.sym.contrib.SyncBatchNorm
                sym_norm_kwargs = {'ndev': len(self.ctx)}
            elif self._cfg.mask_rcnn.norm_layer == 'gn':
                norm_layer = gluon.nn.GroupNorm
                norm_kwargs = {'groups': 8}
                # sym_norm_layer = mx.sym.GroupNorm
                sym_norm_kwargs = {'groups': 8}
            else:
                norm_layer = gluon.nn.BatchNorm
                norm_kwargs = None
                # sym_norm_layer = None
                sym_norm_kwargs = None
            if self._cfg.dataset == 'coco':
                classes = COCODetection.CLASSES
            else:
                # default to VOC
                classes = VOCDetection.CLASSES
            self.net = get_model(
                'custom_mask_rcnn_fpn',
                classes=classes,
                transfer=None,
                dataset=self._cfg.dataset,
                pretrained_base=self._cfg.train.pretrained_base,
                base_network_name=self._cfg.mask_rcnn.backbone,
                norm_layer=norm_layer,
                norm_kwargs=norm_kwargs,
                sym_norm_kwargs=sym_norm_kwargs,
                num_fpn_filters=self._cfg.mask_rcnn.num_fpn_filters,
                num_box_head_conv=self._cfg.mask_rcnn.num_box_head_conv,
                num_box_head_conv_filters=self._cfg.mask_rcnn.
                num_box_head_conv_filters,
                num_box_head_dense_filters=self._cfg.mask_rcnn.
                num_box_head_dense_filters,
                short=self._cfg.mask_rcnn.image_short,
                max_size=self._cfg.mask_rcnn.image_max_size,
                min_stage=2,
                max_stage=6,
                nms_thresh=self._cfg.mask_rcnn.nms_thresh,
                nms_topk=self._cfg.mask_rcnn.nms_topk,
                post_nms=self._cfg.mask_rcnn.post_nms,
                roi_mode=self._cfg.mask_rcnn.roi_mode,
                roi_size=self._cfg.mask_rcnn.roi_size,
                strides=self._cfg.mask_rcnn.strides,
                clip=self._cfg.mask_rcnn.clip,
                rpn_channel=self._cfg.mask_rcnn.rpn_channel,
                base_size=self._cfg.mask_rcnn.anchor_base_size,
                scales=self._cfg.mask_rcnn.anchor_scales,
                ratios=self._cfg.mask_rcnn.anchor_aspect_ratio,
                alloc_size=self._cfg.mask_rcnn.anchor_alloc_size,
                rpn_nms_thresh=self._cfg.mask_rcnn.rpn_nms_thresh,
                rpn_train_pre_nms=self._cfg.train.rpn_train_pre_nms,
                rpn_train_post_nms=self._cfg.train.rpn_train_post_nms,
                rpn_test_pre_nms=self._cfg.valid.rpn_test_pre_nms,
                rpn_test_post_nms=self._cfg.valid.rpn_test_post_nms,
                rpn_min_size=self._cfg.train.rpn_min_size,
                per_device_batch_size=self._cfg.train.batch_size //
                self.num_gpus,
                num_sample=self._cfg.train.rcnn_num_samples,
                pos_iou_thresh=self._cfg.train.rcnn_pos_iou_thresh,
                pos_ratio=self._cfg.train.rcnn_pos_ratio,
                max_num_gt=self._cfg.mask_rcnn.max_num_gt,
                target_roi_scale=self._cfg.mask_rcnn.target_roi_scale,
                num_fcn_convs=self._cfg.mask_rcnn.num_mask_head_convs)
        else:
            self.net = get_model(
                net_name,
                pretrained_base=True,
                per_device_batch_size=self._cfg.train.batch_size //
                self.num_gpus,
                **kwargs)
        self._cfg.save_prefix += net_name
        if self._cfg.resume.strip():
            self.net.load_parameters(self._cfg.resume.strip())
        else:
            for param in self.net.collect_params().values():
                if param._data is not None:
                    continue
                param.initialize()
        self.net.collect_params().reset_ctx(self.ctx)

        if self._cfg.mask_rcnn.amp:
            # Cast both weights and gradients to 'float16'
            self.net.cast('float16')
            # This layers doesn't support type 'float16'
            self.net.collect_params('.*batchnorm.*').setattr(
                'dtype', 'float32')
            self.net.collect_params(
                '.*normalizedperclassboxcenterencoder.*').setattr(
                    'dtype', 'float32')

        # set up logger
        logging.basicConfig()
        self._logger = logging.getLogger()
        self._logger.setLevel(logging.INFO)
        log_file_path = self._cfg.save_prefix + '_train.log'
        log_dir = os.path.dirname(log_file_path)
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
        fh = logging.FileHandler(log_file_path)
        self._logger.addHandler(fh)
        if MPI is None and self._cfg.horovod:
            self._logger.warning(
                'mpi4py is not installed, validation result may be incorrect.')
        self._logger.info(self._cfg)

        self.rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
            from_sigmoid=False)
        self.rpn_box_loss = mx.gluon.loss.HuberLoss(
            rho=self._cfg.train.rpn_smoothl1_rho)  # == smoothl1
        self.rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
        self.rcnn_box_loss = mx.gluon.loss.HuberLoss(
            rho=self._cfg.train.rcnn_smoothl1_rho)  # == smoothl1
        self.rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
            from_sigmoid=False)
        self.metrics = [
            mx.metric.Loss('RPN_Conf'),
            mx.metric.Loss('RPN_SmoothL1'),
            mx.metric.Loss('RCNN_CrossEntropy'),
            mx.metric.Loss('RCNN_SmoothL1'),
            mx.metric.Loss('RCNN_Mask')
        ]

        self.rpn_acc_metric = RPNAccMetric()
        self.rpn_bbox_metric = RPNL1LossMetric()
        self.rcnn_acc_metric = RCNNAccMetric()
        self.rcnn_bbox_metric = RCNNL1LossMetric()
        self.rcnn_mask_metric = MaskAccMetric()
        self.rcnn_fgmask_metric = MaskFGAccMetric()
        self.metrics2 = [
            self.rpn_acc_metric, self.rpn_bbox_metric, self.rcnn_acc_metric,
            self.rcnn_bbox_metric, self.rcnn_mask_metric,
            self.rcnn_fgmask_metric
        ]

        self.async_eval_processes = []
        self.best_map = [0]
        self.epoch = 0

        # training data
        self.train_dataset, self.val_dataset, self.eval_metric = _get_dataset(
            self._cfg.dataset, self._cfg)
        self.batch_size = self._cfg.train.batch_size // self.num_gpus \
            if self._cfg.horovod else self._cfg.train.batch_size
        self._train_data, self._val_data = _get_dataloader(
            self.net, self.train_dataset, self.val_dataset,
            MaskRCNNDefaultTrainTransform, MaskRCNNDefaultValTransform,
            self.batch_size, len(self.ctx), self._cfg)