Exemple #1
0
def train(opt, net, train_loader, criterion, trainer, batch_size, logger):
    """train model"""
    for epoch in range(opt.start_epoch, opt.epochs):
        loss_total_val = 0
        loss_loc_val = 0
        loss_cls_val = 0
        batch_time = time.time()
        for i, data in enumerate(train_loader):
            template, search, label_cls, label_loc, label_loc_weight = train_batch_fn(
                data, opt)
            cls_losses = []
            loc_losses = []
            total_losses = []
            with autograd.record():
                for j in range(len(opt.ctx)):
                    cls, loc = net(template[j], search[j])
                    label_cls_temp = label_cls[j].reshape(-1).asnumpy()
                    pos_index = np.argwhere(label_cls_temp == 1).reshape(-1)
                    neg_index = np.argwhere(label_cls_temp == 0).reshape(-1)
                    if len(pos_index):
                        pos_index = nd.array(pos_index, ctx=opt.ctx[j])
                    else:
                        pos_index = nd.array(np.array([]), ctx=opt.ctx[j])
                    if len(neg_index):
                        neg_index = nd.array(neg_index, ctx=opt.ctx[j])
                    else:
                        neg_index = nd.array(np.array([]), ctx=opt.ctx[j])
                    cls_loss, loc_loss = criterion(cls, loc, label_cls[j],
                                                   pos_index, neg_index,
                                                   label_loc[j],
                                                   label_loc_weight[j])
                    total_loss = opt.cls_weight * cls_loss + opt.loc_weight * loc_loss
                    cls_losses.append(cls_loss)
                    loc_losses.append(loc_loss)
                    total_losses.append(total_loss)

                mx.nd.waitall()
                if opt.use_amp:
                    with amp.scale_loss(total_losses, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(total_losses)
            trainer.step(batch_size)
            loss_total_val += sum([l.mean().asscalar()
                                   for l in total_losses]) / len(total_losses)
            loss_loc_val += sum([l.mean().asscalar()
                                 for l in loc_losses]) / len(loc_losses)
            loss_cls_val += sum([l.mean().asscalar()
                                 for l in cls_losses]) / len(cls_losses)
            if i % (opt.log_interval) == 0:
                logger.info('Epoch %d iteration %04d/%04d: loc loss %.3f, cls loss %.3f, \
                             training loss %.3f, batch time %.3f'                                                                 % \
                            (epoch, i, len(train_loader), loss_loc_val/(i+1), loss_cls_val/(i+1),
                             loss_total_val/(i+1), time.time()-batch_time))
                batch_time = time.time()
            mx.nd.waitall()
        # save every epoch
        if opt.no_val:
            save_checkpoint(net, opt, epoch, False)
    def forward_backward(self, x):
        data, label, gt_mask, rpn_cls_targets, rpn_box_targets, rpn_box_masks = x
        with autograd.record():
            gt_label = label[:, :, 4:5]
            gt_box = label[:, :, :4]
            cls_pred, box_pred, mask_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
                data, gt_box)
            # losses of rpn
            rpn_score = rpn_score.squeeze(axis=-1)
            num_rpn_pos = (rpn_cls_targets >= 0).sum()
            rpn_loss1 = self.rpn_cls_loss(rpn_score, rpn_cls_targets,
                                          rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
            rpn_loss2 = self.rpn_box_loss(rpn_box, rpn_box_targets,
                                          rpn_box_masks) * rpn_box.size / num_rpn_pos
            # rpn overall loss, use sum rather than average
            rpn_loss = rpn_loss1 + rpn_loss2
            # generate targets for rcnn
            cls_targets, box_targets, box_masks = self.net.target_generator(roi, samples,
                                                                            matches, gt_label,
                                                                            gt_box)
            # losses of rcnn
            num_rcnn_pos = (cls_targets >= 0).sum()
            rcnn_loss1 = self.rcnn_cls_loss(cls_pred, cls_targets,
                                            cls_targets.expand_dims(-1) >= 0) * cls_targets.size / \
                         num_rcnn_pos
            rcnn_loss2 = self.rcnn_box_loss(box_pred, box_targets, box_masks) * box_pred.size / \
                         num_rcnn_pos
            rcnn_loss = rcnn_loss1 + rcnn_loss2

            # generate targets for mask
            mask_targets, mask_masks = self.net.mask_target(roi, gt_mask, matches, cls_targets)
            # loss of mask
            mask_loss = self.rcnn_mask_loss(mask_pred, mask_targets, mask_masks) * \
                        mask_targets.size / mask_masks.sum()

            # overall losses
            total_loss = rpn_loss.sum() + rcnn_loss.sum() + mask_loss.sum()

            rpn_loss1_metric = rpn_loss1.mean()
            rpn_loss2_metric = rpn_loss2.mean()
            rcnn_loss1_metric = rcnn_loss1.sum()
            rcnn_loss2_metric = rcnn_loss2.sum()
            mask_loss_metric = mask_loss.sum()
            rpn_acc_metric = [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]]
            rpn_l1_loss_metric = [[rpn_box_targets, rpn_box_masks], [rpn_box]]
            rcnn_acc_metric = [[cls_targets], [cls_pred]]
            rcnn_l1_loss_metric = [[box_targets, box_masks], [box_pred]]
            rcnn_mask_metric = [[mask_targets, mask_masks], [mask_pred]]
            rcnn_fgmask_metric = [[mask_targets, mask_masks], [mask_pred]]

            if args.amp:
                with amp.scale_loss(total_loss, self._optimizer) as scaled_losses:
                    autograd.backward(scaled_losses)
            else:
                total_loss.backward()

        return rpn_loss1_metric, rpn_loss2_metric, rcnn_loss1_metric, rcnn_loss2_metric, \
               mask_loss_metric, rpn_acc_metric, rpn_l1_loss_metric, rcnn_acc_metric, \
               rcnn_l1_loss_metric, rcnn_mask_metric, rcnn_fgmask_metric
    def forward_backward(self, x):
        data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks = x
        with autograd.record():
            gt_label = label[:, :, 4:5]
            gt_box = label[:, :, :4]
            cls_pred, box_pred, _, _, _Z, rpn_score, rpn_box, _, cls_targets, \
                box_targets, box_masks, _ = self.net(data, gt_box, gt_label)
            # losses of rpn
            rpn_score = rpn_score.squeeze(axis=-1)
            num_rpn_pos = (rpn_cls_targets >= 0).sum()
            rpn_loss1 = self.rpn_cls_loss(
                rpn_score, rpn_cls_targets,
                rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
            rpn_loss2 = self.rpn_box_loss(
                rpn_box, rpn_box_targets,
                rpn_box_masks) * rpn_box.size / num_rpn_pos
            # rpn overall loss, use sum rather than average
            rpn_loss = rpn_loss1 + rpn_loss2
            # losses of rcnn
            num_rcnn_pos = (cls_targets >= 0).sum()
            rcnn_loss1 = self.rcnn_cls_loss(
                cls_pred, cls_targets, cls_targets.expand_dims(-1) >= 0) * cls_targets.size / \
                         num_rcnn_pos
            rcnn_loss2 = self.rcnn_box_loss(box_pred, box_targets, box_masks) * box_pred.size / \
                         num_rcnn_pos
            rcnn_loss = rcnn_loss1 + rcnn_loss2
            # overall losses
            total_loss = rpn_loss.sum() * self.mix_ratio + rcnn_loss.sum(
            ) * self.mix_ratio

            rpn_loss1_metric = rpn_loss1.mean() * self.mix_ratio
            rpn_loss2_metric = rpn_loss2.mean() * self.mix_ratio
            rcnn_loss1_metric = rcnn_loss1.mean() * self.mix_ratio
            rcnn_loss2_metric = rcnn_loss2.mean() * self.mix_ratio
            rpn_acc_metric = [[rpn_cls_targets, rpn_cls_targets >= 0],
                              [rpn_score]]
            rpn_l1_loss_metric = [[rpn_box_targets, rpn_box_masks], [rpn_box]]
            rcnn_acc_metric = [[cls_targets], [cls_pred]]
            rcnn_l1_loss_metric = [[box_targets, box_masks], [box_pred]]

            if self.amp_enabled:
                from mxnet.contrib import amp
                with amp.scale_loss(total_loss,
                                    self._optimizer) as scaled_losses:
                    autograd.backward(scaled_losses)
            else:
                total_loss.backward()

        return rpn_loss1_metric, rpn_loss2_metric, rcnn_loss1_metric, rcnn_loss2_metric, \
               rpn_acc_metric, rpn_l1_loss_metric, rcnn_acc_metric, rcnn_l1_loss_metric
Exemple #4
0
 def forward_backward(
         self, shard: Tuple) -> List[Tuple[mx.nd.NDArray, mx.nd.NDArray]]:
     """
     Applies forward-backward pass for a single shard of a batch (data-parallel training).
     """
     inputs, labels = shard
     with mx.autograd.record():
         outputs = self.model(*inputs)  # type: Dict[str, mx.nd.NDArray]
         loss_outputs = [
             loss_function(outputs, labels)
             for loss_function in self.loss_functions
         ]
         loss_values = (v for v, _ in loss_outputs)
         sum_losses = mx.nd.add_n(*loss_values)
         if self.using_amp:
             # AMP applies dynamic loss scaling to the losses (scale up) and
             # the Trainer (scale down).
             with amp.scale_loss(sum_losses, self.trainer) as scaled_loss:
                 mx.autograd.backward(scaled_loss)
         else:
             # backward on the sum of losses, weights are defined in the loss blocks themselves.
             sum_losses.backward()
     return loss_outputs
    def train_step(self, images, cls_targets, box_targets):
        # treat targets as parameters of loss
        # so they are internal tensor of the network
        # no problem of bulks accessing external inputs
        # which makes loss completely sync free
        # copy box_targets and cls_targets to parameters
        for param_name, param in self.train_net.collect_params().items():
            if "box_target" in param_name:
                param.set_data(box_targets)
            elif "cls_target" in param_name:
                param.set_data(cls_targets)

        with autograd.record():
            sum_loss = self.train_net(images)
            if self.precision == 'amp':
                with amp.scale_loss(sum_loss, self.trainer) as scaled_loss:
                    autograd.backward(scaled_loss)
            elif self.precision == 'fp16':
                scaled_loss = sum_loss * self.fp16_loss_scale
                autograd.backward(scaled_loss)
            else:
                autograd.backward(sum_loss)

        return sum_loss
Exemple #6
0
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                if check_flag()[0]:
                    sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))
Exemple #7
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])
    trainer = gluon.Trainer(net.collect_params(),
                            'sgd', {
                                'lr_scheduler': lr_scheduler,
                                'wd': args.wd,
                                'momentum': args.momentum
                            },
                            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    print("train_efficientdet.py-148 train classes=", classes, len(classes))
    cls_box_loss = EfficientDetLoss(len(classes) + 1, rho=0.1, lambd=50.0)
    ce_metric = mx.metric.Loss('FocalLoss')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]

    for epoch in range(args.start_epoch + 1, args.epochs + 1):
        logger.info("[Epoch {}] Set learning rate to {}".format(
            epoch, trainer.learning_rate))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1],
                                                     ctx_list=ctx,
                                                     batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2],
                                                     ctx_list=ctx,
                                                     batch_axis=0)

            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = cls_box_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)

            local_batch_size = int(args.batch_size)
            ce_metric.update(0, [l * local_batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * local_batch_size for l in box_loss])
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                    .format(epoch, i, args.batch_size / (time.time() - btic),
                            name1, loss1, name2, loss2))
            btic = time.time()

        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                epoch, (time.time() - tic), name1, loss1, name2, loss2))
        if (epoch % args.val_interval
                == 0) or (args.save_interval
                          and epoch % args.save_interval == 0):
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
Exemple #8
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001, decay_lr=0.999, decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 일그러지는 것을 막기 위함
    if input_size[0] % 32 != 0 and input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)

    except Exception:
        logging.info("dataset 없음")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        '''
        mxnet c++에서 arbitrary input image 를 받기 위한 전략
        alloc_size : tuple of int, default is (128, 128)
        For advanced users. Define `alloc_size` to generate large enough offset
        maps, which will later saved in parameters. During inference, we support arbitrary
        input image by cropping corresponding area of the anchor map. This allow us
        to export to symbol so we can run it in c++, Scalar, etc.
        '''
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground만
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False})

        else:
            logging.error("optimizer not selected")
            exit(0)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0)
            wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0)
            objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0)
            class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0)
            weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                xcyc_all = [xcyc_all]
                wh_all = [wh_all]
                objectness_all = [objectness_all]
                class_all = [class_all]
                weights_all = [weights_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image,
                                                                                                           xcyc_all,
                                                                                                           wh_all,
                                                                                                           objectness_all,
                                                                                                           class_all,
                                                                                                           weights_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False)

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split,
                                                                                              wh_split,
                                                                                              objectness_split,
                                                                                              class_split,
                                                                                              weights_split):
                        output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                            img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                           wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)
                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean
        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # loss 구하기
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size, _, height, width = image.shape

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False)

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []
                total_loss = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all,
                                                                                              wh_all, objectness_all,
                                                                                              class_all, weights_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2,
                                                 offset3, stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                       wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())
                    total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses)

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1,
                                                     offset2, offset3, stride1, stride2, stride3)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes, absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes, absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)

                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean}, global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean}, global_step=i)

                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Exemple #9
0
    # Train
    for iBatch, (data, label, weight) in enumerate(batchDataSet_trn):

        data = data.as_in_context(context).astype("float32")
        label = label.as_in_context(context).astype("float32")
        weight = weight.as_in_context(context).astype("float32")

        # forward + backward
        with mxnet.autograd.record():

            #print mxnet.autograd.is_training()

            output = neuralNetwork(data)
            loss = lossFunction(output, label, weight)

            with amp.scale_loss(loss, trainer) as scaled_loss:

                mxnet.autograd.backward(scaled_loss)

        #loss.backward()

        # update parameters
        trainer.step(batchSize)

    # Compute some stuff
    for iBatch, (data, label, weight) in enumerate(batchDataSet_trn):

        data = data.as_in_context(context).astype("float32")
        label = label.as_in_context(context).astype("float32")
        weight = weight.as_in_context(context).astype("float32")
Exemple #10
0
def train_text_classification(args, reporter=None):
    # Step 1: add scripts every function and python objects in the original training script except for the training function
    # at the beginning of the decorated function
    nlp = try_import_gluonnlp()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.INFO)
        logger.info(args)
    batch_size = args.batch_size
    dev_batch_size = args.dev_batch_size
    lr = args.lr
    epsilon = args.epsilon
    accumulate = args.accumulate
    log_interval = args.log_interval * accumulate if accumulate else args.log_interval
    if accumulate:
        logger.info('Using gradient accumulation. Effective batch size = ' \
                     'batch_size * accumulate = %d', accumulate * batch_size)

    # random seed
    np.random.seed(args.seed)
    random.seed(args.seed)
    mx.random.seed(args.seed)

    # TODO support for multi-GPU
    ctx = [mx.gpu(i) for i in range(args.num_gpus)
           ][0] if args.num_gpus > 0 else [mx.cpu()][0]

    task = args.dataset
    # data type with mixed precision training
    if args.dtype == 'float16':
        try:
            from mxnet.contrib import amp  # pylint: disable=ungrouped-imports
            # monkey patch amp list since topk does not support fp16
            amp.lists.symbol.FP32_FUNCS.append('topk')
            amp.lists.symbol.FP16_FP32_FUNCS.remove('topk')
            amp.init()
        except ValueError:
            # topk is already in the FP32_FUNCS list
            amp.init()
        except ImportError:
            # amp is not available
            logger.info(
                'Mixed precision training with float16 requires MXNet >= '
                '1.5.0b20190627. Please consider upgrading your MXNet version.'
            )
            exit()

    # model and loss
    model_name = args.net
    dataset = args.pretrained_dataset

    use_roberta = 'roberta' in model_name
    get_model_params = {
        'name': model_name,
        'dataset_name': dataset,
        'pretrained': True,
        'ctx': ctx,
        'use_decoder': False,
        'use_classifier': False,
    }
    # RoBERTa does not contain parameters for sentence pair classification
    if not use_roberta:
        get_model_params['use_pooler'] = True

    bert, vocabulary = nlp.model.get_model(**get_model_params)
    model = get_network(bert, task.class_labels, use_roberta)
    #do_regression = not task.class_labels
    #if do_regression:
    #    num_classes = 1
    #    loss_function = gluon.loss.L2Loss()
    #else:
    #    num_classes = len(task.class_labels)
    #    loss_function = gluon.loss.SoftmaxCELoss()
    ## reuse the BERTClassifier class with num_classes=1 for regression
    #if use_roberta:
    #    model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
    #else:
    #    model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)
    # initialize classifier
    loss_function = gluon.loss.SoftmaxCELoss(
    ) if task.class_labels else gluon.loss.L2Loss()
    initializer = mx.init.Normal(0.02)
    model.classifier.initialize(init=initializer, ctx=ctx)

    model.hybridize(static_alloc=True)
    loss_function.hybridize(static_alloc=True)

    # data processing
    do_lower_case = 'uncased' in dataset
    if use_roberta:
        bert_tokenizer = nlp.data.GPT2BPETokenizer()
    else:
        bert_tokenizer = nlp.data.BERTTokenizer(vocabulary,
                                                lower=do_lower_case)

    # Get the loader.
    train_data, dev_data_list, num_train_examples, trans, test_trans = preprocess_data(
        bert_tokenizer, task, batch_size, dev_batch_size, args.max_len,
        vocabulary, True, args.num_workers)

    def log_train(batch_id, batch_num, metric, step_loss, log_interval,
                  epoch_id, learning_rate, tbar):
        """Generate and print out the log message for training. """
        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]

        train_str = '[Epoch %d] loss=%.4f, lr=%.7f, metrics:' + \
                    ','.join([i + ':%.4f' for i in metric_nm])
        tbar.set_description(
            train_str %
            (epoch_id, step_loss / log_interval, learning_rate, *metric_val))

    def log_eval(batch_id, batch_num, metric, step_loss, log_interval, tbar):
        """Generate and print out the log message for inference. """
        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]

        eval_str = 'loss=%.4f, metrics:' + \
                   ','.join([i + ':%.4f' for i in metric_nm])
        tbar.set_description(eval_str %
                             (step_loss / log_interval, *metric_val))

    def evaluate(loader_dev, metric, segment):
        """Evaluate the model on validation dataset."""
        metric.reset()
        step_loss = 0
        tbar = tqdm(loader_dev)
        for batch_id, seqs in enumerate(tbar):
            input_ids, valid_length, segment_ids, label = seqs
            input_ids = input_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx).astype('float32')
            label = label.as_in_context(ctx)
            if use_roberta:
                out = model(input_ids, valid_length)
            else:
                out = model(input_ids, segment_ids.as_in_context(ctx),
                            valid_length)
            ls = loss_function(out, label).mean()

            step_loss += ls.asscalar()
            metric.update([label], [out])

            if (batch_id + 1) % (args.log_interval) == 0:
                log_eval(batch_id, len(loader_dev), metric, step_loss,
                         args.log_interval, tbar)
                step_loss = 0

        metric_nm, metric_val = metric.get()
        if not isinstance(metric_nm, list):
            metric_nm, metric_val = [metric_nm], [metric_val]
        metric_str = 'validation metrics:' + ','.join(
            [i + ':%.4f' for i in metric_nm])
        logger.info(metric_str, *metric_val)

        mx.nd.waitall()
        return metric_nm, metric_val

    # Step 2: the training function in the original training script is added in the decorated function in autogluon for training.
    """Training function."""

    all_model_params = model.collect_params()
    optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
    trainer = gluon.Trainer(all_model_params,
                            'bertadam',
                            optimizer_params,
                            update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    step_num = 0

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in all_model_params.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'
    # track best eval score
    metric_history = []
    best_metric = None
    patience = args.early_stop

    tic = time.time()
    for epoch_id in range(args.epochs):
        if args.early_stop and patience == 0:
            logger.info('Early stopping at epoch %d', epoch_id)
            break
        task.metric.reset()
        step_loss = 0
        tic = time.time()
        all_model_params.zero_grad()

        tbar = tqdm(train_data)
        for batch_id, seqs in enumerate(tbar):
            # learning rate schedule
            if step_num < num_warmup_steps:
                new_lr = lr * step_num / num_warmup_steps
            else:
                non_warmup_steps = step_num - num_warmup_steps
                offset = non_warmup_steps / (num_train_steps -
                                             num_warmup_steps)
                new_lr = lr - offset * lr
            trainer.set_learning_rate(new_lr)

            # forward and backward
            with mx.autograd.record():
                input_ids, valid_length, segment_ids, label = seqs
                input_ids = input_ids.as_in_context(ctx)
                valid_length = valid_length.as_in_context(ctx).astype(
                    'float32')
                label = label.as_in_context(ctx)
                if use_roberta:
                    out = model(input_ids, valid_length)
                else:
                    out = model(input_ids, segment_ids.as_in_context(ctx),
                                valid_length)
                ls = loss_function(out, label).mean()
                if args.dtype == 'float16':
                    with amp.scale_loss(ls, trainer) as scaled_loss:
                        mx.autograd.backward(scaled_loss)
                else:
                    ls.backward()

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(accumulate if accumulate else 1)
                step_num += 1
                if accumulate and accumulate > 1:
                    # set grad to zero for gradient accumulation
                    all_model_params.zero_grad()

            step_loss += ls.asscalar()
            task.metric.update([label], [out])
            if (batch_id + 1) % (args.log_interval) == 0:
                log_train(batch_id, len(train_data), task.metric, step_loss,
                          args.log_interval, epoch_id, trainer.learning_rate,
                          tbar)
                step_loss = 0
        mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:
            metric_nm, metric_val = evaluate(dev_data, task.metric, segment)
            if best_metric is None or metric_val >= best_metric:
                best_metric = metric_val
                patience = args.early_stop
            else:
                if args.early_stop is not None:
                    patience -= 1
            metric_history.append((epoch_id, metric_nm, metric_val))

        if reporter is not None:
            # Note: epoch reported back must start with 1, not with 0
            reporter(epoch=epoch_id + 1, accuracy=metric_val[0])

    if args.final_fit:
        get_model_params.pop('ctx')
        return {
            'model_params': collect_params(model),
            'get_model_args': get_model_params,
            'class_labels': task.class_labels,
            'transform': trans,
            'test_transform': test_trans
        }
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            import warnings
            with warnings.catch_warnings(record=True) as w:
                net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if accumulate > 1:
            logger.info(f'accumulate: {accumulate}, using "add" grad_req')
            import warnings
            with warnings.catch_warnings(record=True) as w:
                net.collect_params().setattr('grad_req', 'add')

        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params,
                                update_on_kvstore=False if opt.amp else None)
        if opt.amp:
            amp.init_trainer(trainer)
        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(
                temperature=opt.temperature,
                hard_weight=opt.hard_weight,
                sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(
                sparse_label=sparse_label_loss)

        best_val_score = 1

        err_top1_val, err_top5_val = test(ctx, val_data)
        logger.info('initial validation: err-top1=%f err-top5=%f' %
                    (err_top1_val, err_top5_val))

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            train_loss_metric.reset()
            btic = time.time()
            pbar = tqdm.tqdm(total=num_batches,
                             desc=f'Training [{epoch}]',
                             leave=True)
            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                if opt.mixup:
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]

                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif opt.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)

                if distillation:
                    # teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                    #                 for X in data]
                    with ag.predict_mode():
                        teacher_prob = [
                            nd.softmax(
                                teacher(
                                    nd.transpose(
                                        nd.image.resize(
                                            nd.transpose(X, (0, 2, 3, 1)),
                                            size=opt.teacher_imgsize),
                                        (0, 3, 1, 2))) / opt.temperature)
                            for X in data
                        ]

                with ag.record():
                    # outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                    outputs = [net(X) for X in data]
                    if distillation:
                        # loss = [L(yhat.astype('float32', copy=False),
                        #           y.astype('float32', copy=False),
                        #           p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                        # print([outputs, label, teacher_prob])
                        loss = [
                            L(yhat, y, p)
                            for yhat, y, p in zip(outputs, label, teacher_prob)
                        ]
                    else:
                        # loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                        loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
                    if opt.amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)
                if accumulate > 1:
                    if (i + 1) % accumulate == 0:
                        trainer.step(batch_size * accumulate)
                        net.collect_params().zero_grad()
                else:
                    trainer.step(batch_size)

                train_loss_metric.update(0, loss)

                if opt.mixup:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                      for out in outputs]
                    train_metric.update(label, output_softmax)
                else:
                    if opt.label_smoothing:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                _, loss_score = train_loss_metric.get()
                train_metric_name, train_metric_score = train_metric.get()
                samplers_per_sec = batch_size / (time.time() - btic)
                postfix = f'{samplers_per_sec:.1f} imgs/sec, ' \
                          f'loss: {loss_score:.4f}, ' \
                          f'acc: {train_metric_score * 100:.2f}, ' \
                          f'lr: {trainer.learning_rate:.4e}'
                if opt.multi_scale:
                    postfix += f', size: {data[0].shape[-1]}'
                pbar.set_postfix_str(postfix)
                pbar.update()
                btic = time.time()
                if opt.log_interval and not (i + 1) % opt.log_interval:
                    step = epoch * num_batches + i
                    wandb.log(
                        {
                            'samplers_per_sec': samplers_per_sec,
                            train_metric_name: train_metric_score,
                            'lr': trainer.learning_rate,
                            'loss': loss_score
                        },
                        step=step)
                    logger.info(
                        'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'
                        % (epoch, i, samplers_per_sec, train_metric_name,
                           train_metric_score, trainer.learning_rate))

            pbar.close()
            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)
            wandb.log({
                'err1': err_top1_val,
                'err5': err_top5_val
            },
                      step=epoch * num_batches)

            logger.info('[Epoch %d] training: %s=%f' %
                        (epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' %
                        (epoch, err_top1_val, err_top5_val))

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters(
                    '%s/%.4f-imagenet-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))
                trainer.save_states(
                    '%s/%.4f-imagenet-%s-%d-best.states' %
                    (save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch +
                                                1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params' %
                                    (save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states' %
                                    (save_dir, model_name, epoch))

        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, model_name, opt.num_epochs - 1))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, model_name, opt.num_epochs - 1))
Exemple #12
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP 가 모든 연산을 지원하지는 않는다.
    modulated convolution을 지원하지 않음
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # 고정
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)

    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0001,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0001,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            topk=topk,
                            scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()
        '''
        target generator를 train_dataloader에서 만들어 버리는게 학습 속도가 훨씬 빠르다. 
        '''

        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image,
                                      num_outputs=subdivision,
                                      axis=0)
            heatmap_split = mx.nd.split(data=heatmap,
                                        num_outputs=subdivision,
                                        axis=0)
            offset_target_split = mx.nd.split(data=offset_target,
                                              num_outputs=subdivision,
                                              axis=0)
            wh_target_split = mx.nd.split(data=wh_target,
                                          num_outputs=subdivision,
                                          axis=0)
            mask_target_split = mx.nd.split(data=mask_target,
                                            num_outputs=subdivision,
                                            axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(
                            image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(
                            image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(
                            heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기

            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # loss 구하기
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, ctx, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)

                    id, score, bbox = prediction(heatmap_pred, offset_pred,
                                                 wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred,
                                                    heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred,
                                                     wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                            bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # heatmap 그리기
                        heatmap = mx.nd.multiply(heatmap,
                                                 255.0)  # 0 ~ 255 범위로 바꾸기
                        heatmap = mx.nd.max(
                            heatmap, axis=0,
                            keepdims=True)  # channel 축으로 가장 큰것 뽑기
                        heatmap = mx.nd.transpose(
                            heatmap,
                            axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(
                            heatmap, repeats=3,
                            axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy(
                        )  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap,
                                             dsize=(input_size[1],
                                                    input_size[0]))  # 사이즈 원복
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :,
                                (0, 1, 2)] = heatmap[:, :,
                                                     (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(
                            heatmap,
                            axes=(2, 0, 1))  # (channel=3, height, width)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate(
                    [np.array(batch_image),
                     np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result",
                                  image=all_image,
                                  global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean":
                                       train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean":
                                       valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean":
                                       train_offset_loss_mean,
                                       "valid_offset_loss_mean":
                                       valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean":
                                       train_wh_loss_mean,
                                       "valid_wh_loss_mean": valid_wh_loss_mean
                                   },
                                   global_step=i)

                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # 새로운 객체가 생성
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Exemple #13
0
def model_fit(args, net, train_data, eval_metric, optimizer, optimizer_params,
              lr_scheduler, eval_data, global_metrics, kvstore, kv,
              begin_epoch, num_epoch, run_epoch, model_prefix):
    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)
    loss_metric = ScalarMetric()

    if 'horovod' in kvstore:
        trainer = hvd.DistributedTrainer(net.collect_params(), optimizer,
                                         optimizer_params)
    else:
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params,
                                kvstore=kv,
                                update_on_kvstore=False)

    if args.amp:
        amp.init_trainer(trainer)

    sparse_label_loss = (args.label_smoothing == 0 and args.mixup == 0)
    loss = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
    loss.hybridize(static_shape=True, static_alloc=True)

    local_batch_size = train_data.batch_size
    total_batch_size = local_batch_size * train_data._num_gpus * (
        hvd.size() if 'horovod' in kvstore else 1)
    durations = []

    epoch_size = get_epoch_size(args, kv)
    run_epoch = num_epoch if (run_epoch == -1) else (begin_epoch + run_epoch)

    def transform_data(images, labels):
        if args.mixup != 0:
            coeffs = mx.nd.array(
                np.random.beta(args.mixup, args.mixup,
                               size=images.shape[0])).as_in_context(
                                   images.context)
            image_coeffs = coeffs.astype(images.dtype, copy=False).reshape(
                *coeffs.shape, 1, 1, 1)
            ret_images = image_coeffs * images + (1 -
                                                  image_coeffs) * images[::-1]

            ret_labels = label_smoothing(labels, args.num_classes,
                                         args.label_smoothing)
            label_coeffs = coeffs.reshape(*coeffs.shape, 1)
            ret_labels = label_coeffs * ret_labels + (
                1 - label_coeffs) * ret_labels[::-1]
        else:
            ret_images = images
            if not sparse_label_loss:
                ret_labels = label_smoothing(labels, args.num_classes,
                                             args.label_smoothing)
            else:
                ret_labels = labels

        return ret_images, ret_labels

    i = -1
    best_accuracy = -1
    for epoch in range(begin_epoch, min(run_epoch, num_epoch)):
        tic = time.time()
        btic = time.time()
        etic = time.time()

        train_data.reset()
        eval_metric.reset()
        loss_metric.reset()

        logging.info('Starting epoch {}'.format(epoch))
        outputs = []
        for i, batches in enumerate(train_data):
            # synchronize to previous iteration
            #for o in outputs:
            #    o.wait_to_read()

            trainer.set_learning_rate(lr_scheduler(epoch + i / epoch_size))

            data = [b.data[0] for b in batches]
            label = [
                b.label[0].as_in_context(b.data[0].context) for b in batches
            ]
            orig_label = label

            data, label = zip(*starmap(transform_data, zip(data, label)))

            outputs = []
            Ls = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    outputs.append(z)

                if args.amp:
                    with amp.scale_loss(Ls, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
                else:
                    ag.backward(Ls)

            if 'horovod' in kvstore:
                trainer.step(local_batch_size)
            else:
                trainer.step(total_batch_size)

            loss_metric.update(..., np.mean([l.asnumpy() for l in Ls]).item())

            if args.disp_batches and not (i + 1) % args.disp_batches:
                dllogger_it_data = {
                    'train.loss':
                    loss_metric.get()[1],
                    'train.ips':
                    args.disp_batches * total_batch_size /
                    (time.time() - btic),
                    'train.lr':
                    trainer.learning_rate
                }
                dllogger.log((epoch, i), data=dllogger_it_data)

                loss_metric.reset_local()
                btic = time.time()

            durations.append(time.time() - tic)
            tic = time.time()

        durations = durations[min(len(durations) // 10, 100):]
        dllogger_epoch_data = {
            'train.loss': loss_metric.get_global()[1],
            'train.ips': total_batch_size / np.mean(durations)
        }
        if args.mode == 'train_val':
            logging.info('Validating epoch {}'.format(epoch))
            score, duration_stats, _ = model_score(args, net, eval_data,
                                                   eval_metric, kvstore)

            dllogger_epoch_data.update(
                starmap(lambda key, val: ('val.{}'.format(key), val),
                        zip(*score)))
            dllogger_epoch_data.update(
                starmap(lambda key, val: ('val.{}'.format(key), val),
                        duration_stats.items()))

            score = dict(zip(*score))
            accuracy = score.get('accuracy', -1)
            save_checkpoint(net, epoch, accuracy, best_accuracy, model_prefix,
                            args.save_frequency, kvstore)
            best_accuracy = max(best_accuracy, accuracy)
        global_metrics.update_dict(dllogger_epoch_data)
        dllogger.log(step=(epoch, ), data=dllogger_epoch_data)
Exemple #14
0
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        root = os.path.join('..', 'datasets', 'cifar-10')
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd,
                                 'momentum': opt.momentum, 'lr_scheduler': lr_sch})
        if opt.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if opt.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    profiler.set_state('run')
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not opt.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not opt.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if opt.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    nd.waitall()
                    profiler.set_state('stop')
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            if opt.mixup:
                train_history.update([acc, 1-val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            else:
                train_history.update([1-train_acc, 1-val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            # acc_history.update([train_acc, val_acc])
            # plt.cla()
            # acc_history.plot(save_path='%s/%s_acc.png' %
            #                  (plot_name, model_name), legend_loc='best')

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, time.time()-tic))
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)
            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))
Exemple #15
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # 운영체제 확인
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())
    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # 클래스 수
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    model = str(input_size[0]) + "_" + str(
        input_size[1]) + "_" + optimizer + "_EFF_" + str(base)

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path,
                                  f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground만
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'
    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "beta1": 0.9,
                                    "beta2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "gamma1": 0.9,
                                    "gamma2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "momentum": 0.9,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # optimizer weight 불러오기
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")
    '''
    localization loss -> Smooth L1 loss 
    confidence loss -> Focal 
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]
            '''
            autograd 설명
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split,
                                                             ctx_list,
                                                             even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split,
                                                           ctx_list,
                                                           even_split=False)
                    box_split = gluon.utils.split_and_load(box_split,
                                                           ctx_list,
                                                           even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # gpu N 개를 대비한 코드 (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # 비우기
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # optimizer weight 저장하기
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)
            '''
                mxnet1.6.0 버전 에서 AMP 사용시 위에 미리 선언한 prediction을 사용하면 문제가 될 수 있다. 
                -yolo v3, gaussian yolo v3 에서는 문제가 발생한다.
                mxnet 1.5.x 버전에서는 아래와 같이 새로 선언하지 않아도 정상 동작한다.  

                block들은 함수 인자로 보낼 경우 자기 자신이 보내진다.(복사되는 것이 아님)
                export_block_for_cplusplus 에서 prediction 이 hybridize 되면서 
                미리 선언한 prediction도 hybridize화 되면서 symbol 형태가 된다. 
                이런 현상을 보면 아래와같이 다시 선언해 주는게 맞는 것 같다.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # onnx 추출용
                # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # loss 구하기
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # gpu N 개를 대비한 코드 (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # gpu N 개를 대비한 코드 (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # ground truth box 그리기
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # prediction box 그리기
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # Tensorboard에 그리기 (height, width, channel) -> (channel, height, width) 를한다.
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Exemple #16
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    for k, v in net.collect_params('.*beta|.*bias').items():
        v.wd_mult = 0.0

    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
                        net.collect_train_params(), # fix batchnorm, fix first stage, etc...
                        'sgd',
                        {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})
    else:
        trainer = gluon.Trainer(
                    net.collect_train_params(), # fix batchnorm, fix first stage, etc...
                    'sgd',
                    {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum},
                    update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    metrics = [mx.metric.Loss('RPN_Conf'),
               mx.metric.Loss('RPN_SmoothL1'),
               mx.metric.Loss('RCNN_CrossEntropy'),
               mx.metric.Loss('RCNN_SmoothL1'),
               mx.metric.Loss('RCNN_Mask')]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    rcnn_mask_metric = MaskAccMetric()
    rcnn_fgmask_metric = MaskFGAccMetric()
    metrics2 = [rpn_acc_metric, rpn_bbox_metric,
                rcnn_acc_metric, rcnn_bbox_metric,
                rcnn_mask_metric, rcnn_fgmask_metric]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        base_lr = trainer.learning_rate
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                for data, label, gt_mask, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(
                        *batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, mask_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
                        data, gt_box)
                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets,
                                             rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets,
                                             rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(roi, samples,
                                                                               matches, gt_label,
                                                                               gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(cls_pred, cls_targets,
                                               cls_targets >= 0) * cls_targets.size / \
                                 cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(box_pred, box_targets, box_masks) * box_pred.size / \
                                 box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # generate targets for mask
                    mask_targets, mask_masks = net.mask_target(roi, gt_mask, matches, cls_targets)
                    # loss of mask
                    mask_loss = rcnn_mask_loss(mask_pred, mask_targets, mask_masks) * \
                                mask_targets.size / mask_targets.shape[0] / mask_masks.sum()
                    # overall losses
                    losses.append(rpn_loss.sum() + rcnn_loss.sum() + mask_loss.sum())
                    if (not args.horovod or hvd.rank() == 0):
                        metric_losses[0].append(rpn_loss1.sum())
                        metric_losses[1].append(rpn_loss2.sum())
                        metric_losses[2].append(rcnn_loss1.sum())
                        metric_losses[3].append(rcnn_loss2.sum())
                        metric_losses[4].append(mask_loss.sum())
                        add_losses[0].append([[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                        add_losses[1].append([[rpn_box_targets, rpn_box_masks], [rpn_box]])
                        add_losses[2].append([[cls_targets], [cls_pred]])
                        add_losses[3].append([[box_targets, box_masks], [box_pred]])
                        add_losses[4].append([[mask_targets, mask_masks], [mask_pred]])
                        add_losses[5].append([[mask_targets, mask_masks], [mask_pred]])
                if args.amp:
                    with amp.scale_loss(losses, trainer) as scaled_losses:
                        autograd.backward(scaled_losses)
                else:
                    autograd.backward(losses)
                if (not args.horovod or hvd.rank() == 0):
                    for metric, record in zip(metrics, metric_losses):
                        metric.update(0, record)
                    for metric, records in zip(metrics2, add_losses):
                        for pred in records:
                            metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if (not args.horovod or hvd.rank() == 0) and args.log_interval and not (i + 1) % args.log_interval:
                msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2])
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg))
                btic = time.time()
        # validate and save params
        if (not args.horovod or hvd.rank() == 0):
            msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args)
                val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, logger, best_map, current_map, epoch, args.save_interval, args.save_prefix)
Exemple #17
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    net.collect_params.reset_ctx(ctx)
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd', {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum
        })
    else:
        trainer = gluon.Trainer(
            net.collect_params(),
            'sgd', {
                'learning_rate': args.lr,
                'wd': args.wd,
                'momentum': args.momentum
            },
            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss(
    )  # compute loss in entire batch across devices
    ce_metric = mx.metric.Loss('CrossEntropy')  # 记录cls 的loss
    smoothl1_metric = mx.metric.Loss('SmoothL1')  # 记录box 偏移量的loss

    # logger set_up
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info("Start training from [Epoch {}]".format(args.start_epoch))
    best_map = [0]

    for epoch in range(args.start_epoch, args.epochs):

        # 重新设置 learning_rate
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info('[Epoch {}] Set learning rate to {}'.format(
                epoch, new_lr))

        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()  # 记录一次循环的时间
        btic = time.time()  # 记录每一个batch的时间
        net.hybridize(static_alloc=True, statis_shape=True)

        for i, batch in enumerate(train_data):
            # get data , target box and target box
            """if args.dali:
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [nd.cast(d.label[1], dtype='float32') for d in batch]

            else:"""
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1],
                                                     ctx_list=ctx,
                                                     batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2],
                                                     ctx_list=ctx,
                                                     batch_axis=0)
            """
            x, y: y本就包含着该图片之前所有生成的锚框target的box位置信息即box_targets(batch_size, N, 4), N是锚框的个数
            以及每个锚框对应的类别即cls_targets(batch_size, N)

             """
            with autograd.record():
                cls_preds, box_preds, _ = net(data)
                # cls_preds: (batch_size, num_anchors, num_cls + 1)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                # 计算loss的时候,是Compute loss in entire batch across devices.
                # 也就是: divide by the sum of num positive targets in batch
                # sum_loss, cls_loss, box_loss 的形状???
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    sum_loss.backward()
            trainer.step(1)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore

            if (not args.horovod or hvd.rank() == 0):
                local_batch_size = int(args.batch_size //
                                       (hvd.size() if args.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(
                    0, [l * local_batch_size for l in box_loss])
                # ce_metric 和 smoothl1_metric 为什么要乘以local_batch_size...T_T
                # to get loss per image
                # ce_metric.get(),smoothl1_metric.get() 方法里面会除以batch_size
                # 所以在这之前,要先乘以batch_size, 否则就会变成loss/num_anchors/(batch_size * batch_size)

                if args.log_interval and not (i + 1) % args.log_interval:
                    # 每隔args.log_interval就记录一次
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logger.info(
                        '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i,
                                args.batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2))
                btic = time.time()

        if (not args.horovod) or hvd.rank() == 0:
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.
                format(epoch, (time.time() - tic), name1, loss1, name2, loss2))
            if (epoch % args.val_interval
                    == 0) or (args.save_interval
                              and epoch % args.save_interval == 0):
                # 每循环args.val_interval或者args.save_interval次
                # 就需要使用验证集来测试一次,得到current_map
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = "\n".join('{}={}'.format(k, v)
                                    for k, v in zip(map_name, mean_ap))
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])  # mean_ap的最后一个数据就是mAP
            else:
                current_map = 0
            save_params(net, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
    def forward_backward(self, x):
        data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks = x
        with autograd.record():
            gt_label = label[:, :, 4:5]
            gt_box = label[:, :, :4]
            # cls_pred (BS, num_samples, C+1)
            # box_pred (BS, num_pos, C, 4)
            # rpn_box  (BS, num_samples, 4)
            # samples  (BS, num_samples)   gt_class_id
            # matches  (BS, num_samples)   gt_indices
            # raw_rpn_score (BS, num_anchors, 1)
            # raw_rpn_box   (BS, num_anchors, 4)
            # anchors       (BS, num_anchors, 4)
            # cls_targets   (BS, num_samples)
            # box_targets   (BS, num_pos, C, 4)
            # box_masks     (BS, num_pos, C, 4)
            # indices       (BS, num_pos)       相对于rpn_box的序号
            # roi           (BS, rpn_post_nms, 4) rpn返回的roi
            # cls_pred, box_pred, rpn_box, samples, matches, raw_rpn_score, raw_rpn_box, anchors, cls_targets, \
            #     box_targets, box_masks, indices = self.net(data, gt_box, gt_label)
            cls_pred, box_pred, _, _, _Z, rpn_score, rpn_box, _, cls_targets, \
                box_targets, box_masks, _, roi = self.net(data, gt_box, gt_label)
            # losses of rpn
            rpn_score = rpn_score.squeeze(axis=-1)
            # rpn_cls_targets: 1: pos 0: neg -1: ignore
            num_rpn_pos = (rpn_cls_targets >= 0).sum()
            rpn_loss1 = self.rpn_cls_loss(
                rpn_score, rpn_cls_targets,
                rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
            rpn_loss2 = self.rpn_box_loss(
                rpn_box, rpn_box_targets,
                rpn_box_masks) * rpn_box.size / num_rpn_pos
            # rpn overall loss, use sum rather than average
            rpn_loss = rpn_loss1 + rpn_loss2
            # losses of rcnn
            num_rcnn_pos = (cls_targets >= 0).sum()
            rcnn_loss1 = self.rcnn_cls_loss(
                cls_pred, cls_targets, cls_targets.expand_dims(-1) >= 0) * cls_targets.size / \
                         num_rcnn_pos
            rcnn_loss2 = self.rcnn_box_loss(box_pred, box_targets, box_masks) * box_pred.size / \
                         num_rcnn_pos
            rcnn_loss = rcnn_loss1 + rcnn_loss2
            # overall losses
            total_loss = rpn_loss.sum() * self.mix_ratio + rcnn_loss.sum(
            ) * self.mix_ratio

            rpn_loss1_metric = rpn_loss1.mean() * self.mix_ratio
            rpn_loss2_metric = rpn_loss2.mean() * self.mix_ratio
            rcnn_loss1_metric = rcnn_loss1.mean() * self.mix_ratio
            rcnn_loss2_metric = rcnn_loss2.mean() * self.mix_ratio
            rpn_acc_metric = [[rpn_cls_targets, rpn_cls_targets >= 0],
                              [rpn_score]]
            rpn_l1_loss_metric = [[rpn_box_targets, rpn_box_masks], [rpn_box]]
            rcnn_acc_metric = [[cls_targets], [cls_pred]]
            rcnn_l1_loss_metric = [[box_targets, box_masks], [box_pred]]

            if self.amp_enabled:
                with amp.scale_loss(total_loss,
                                    self._optimizer) as scaled_losses:
                    autograd.backward(scaled_losses)
            else:
                total_loss.backward()

            # rpn_gt_recalls = []
            # for i in range(gt_label.shape[0]):
            #     # 如果两个box面积都是0,iou是0
            #     iou = contrib.ndarray.box_iou(roi[i], gt_box[i], format='corner')
            #     iou_gt_max = nd.max(iou, axis=0)
            #     _gt_label = nd.squeeze(gt_label[i])
            #     iou_gt_max = contrib.nd.boolean_mask(iou_gt_max, _gt_label != -1)
            #     rpn_gt_recall = nd.mean(iou_gt_max >= 0.5)
            #     rpn_gt_recalls.append(rpn_gt_recall)
            # rpn_gt_recall = sum(rpn_gt_recalls) / len(rpn_gt_recalls)
            rpn_gt_recall = nd.zeros((1, ), ctx=roi.context)

        return rpn_loss1_metric, rpn_loss2_metric, rcnn_loss1_metric, rcnn_loss2_metric, rpn_gt_recall, \
               rpn_acc_metric, rpn_l1_loss_metric, rcnn_acc_metric, rcnn_l1_loss_metric
Exemple #19
0
    def forward(self, x):
        data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks = x
        with autograd.record():
            gt_box = label[:, :, :4]
            gt_label = label[:, :, 4:5]
            cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors = self.net(data, gt_box)
            cls_targets, box_targets, box_masks = self.net.target_generator(roi, samples,
                                                                            matches, gt_label, gt_box)
            """
            data:  (1, 3, h, w)
            label: (1, num_obj, 6)
            
            # rpn
            roi: (1, 128, 4)
            samples: (1, 128)
            matches: (1, 128)
            
            rpn_cls_targets: (1, N)                  rpn_score:  (1, N, 1)
            rpn_box_targets: (1, N, 4)               rpn_box:  (1, N, 4)
            rpn_box_masks:   (1, N, 4)
    
            # rcnn
            cls_targets:   (1, 128)                   cls_preds: (1, 128, num_cls + 1)
            box_targets:   (1, 128, num_cls, 4)       box_preds: (1, 128, num_cls, 4)
            rcnn box mask: (1, 128, num_cls, 4)
            """

            # loss of rpn
            rpn_score = rpn_score.squeeze(axis=-1)
            num_rpn_pos = (rpn_cls_targets >= 0).sum()
            rpn_loss_cls = self.rpn_cls_loss(rpn_score, rpn_cls_targets, rpn_cls_targets >= 0) * \
                           rpn_cls_targets.size / num_rpn_pos
            rpn_loss_box = self.rpn_box_loss(rpn_box, rpn_box_targets, rpn_box_masks) * \
                           rpn_box_targets.size / num_rpn_pos
            rpn_loss = rpn_loss_cls + rpn_loss_box

            # loss of rcnn
            num_rcnn_pos = (cls_targets >= 0).sum()
            rcnn_loss_cls = self.rcnn_cls_loss(cls_preds, cls_targets, cls_targets >= 0) * \
                            cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
            rcnn_loss_box = self.rcnn_box_loss(box_preds, box_targets, box_masks) * \
                            box_targets.size / box_targets.shape[0] / num_rcnn_pos
            rcnn_loss = rcnn_loss_cls + rcnn_loss_box

            # overall loss
            total_loss = rpn_loss.sum() * self.mix_ratio + rcnn_loss.sum() * self.mix_ratio

            rpn_cls_loss_metric = rpn_loss_cls.sum() * self.mix_ratio
            rpn_box_loss_metric = rpn_loss_box.sum() * self.mix_ratio
            rcnn_cls_loss_metric = rcnn_loss_cls.sum() * self.mix_ratio
            rcnn_box_loss_metric = rcnn_loss_box.sum() * self.mix_ratio

            rpn_acc_metric = [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]]
            rpn_l1_loss_metric = [[rpn_box_targets, rpn_box_masks], [rpn_box]]
            rcnn_acc_metric = [[cls_targets], [cls_preds]]
            rcnn_l1_loss_metric = [[box_targets, box_masks], [box_preds]]

        if args.amp:
            with amp.scale_loss(total_loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            total_loss.backward()

        return rpn_cls_loss_metric, rpn_box_loss_metric, rcnn_cls_loss_metric, rcnn_box_loss_metric, \
               rpn_acc_metric, rpn_l1_loss_metric, rcnn_acc_metric, rcnn_l1_loss_metric
Exemple #20
0
def train():
    """Training function."""
    segment = 'train'  #if not args.debug else 'dev'
    log.info('Loading %s data...', segment)
    if version_2:
        train_data = SQuAD(segment, version='2.0')
    else:
        train_data = SQuAD(segment, version='1.1')
    if args.debug:
        sampled_data = [train_data[i] for i in range(0, 10000)]
        train_data = mx.gluon.data.SimpleDataset(sampled_data)
    log.info('Number of records in Train data:{}'.format(len(train_data)))
    train_data_transform = preprocess_dataset(
        tokenizer,
        train_data,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        input_features=True)

    log.info('The number of examples after preprocessing:{}'.format(
        len(train_data_transform)))

    sampler = nlp.data.SplitSampler(len(train_data_transform),
                                    num_parts=size,
                                    part_index=rank,
                                    even_size=True)
    num_train_examples = len(sampler)
    train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
                                                batchify_fn=batchify_fn,
                                                batch_size=batch_size,
                                                num_workers=4,
                                                sampler=sampler)

    log.info('Start Training')

    optimizer_params = {'learning_rate': lr}
    param_dict = net.collect_params()
    if args.comm_backend == 'horovod':
        trainer = hvd.DistributedTrainer(param_dict, optimizer,
                                         optimizer_params)
    else:
        trainer = mx.gluon.Trainer(param_dict,
                                   optimizer,
                                   optimizer_params,
                                   update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    if args.training_steps:
        num_train_steps = args.training_steps

    num_warmup_steps = int(num_train_steps * warmup_ratio)

    def set_new_lr(step_num, batch_id):
        """set new learning rate"""
        # set grad to zero for gradient accumulation
        if accumulate:
            if batch_id % accumulate == 0:
                step_num += 1
        else:
            step_num += 1
        # learning rate schedule
        # Notice that this learning rate scheduler is adapted from traditional linear learning
        # rate scheduler where step_num >= num_warmup_steps, new_lr = 1 - step_num/num_train_steps
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            offset = (step_num - num_warmup_steps) * lr / \
                (num_train_steps - num_warmup_steps)
            new_lr = lr - offset
        trainer.set_learning_rate(new_lr)
        return step_num

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in param_dict.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate:
        for p in params:
            p.grad_req = 'add'
    net.collect_params().zero_grad()

    epoch_tic = time.time()

    total_num = 0
    log_num = 0
    batch_id = 0
    step_loss = 0.0
    tic = time.time()
    step_num = 0

    tic = time.time()
    while step_num < num_train_steps:
        for _, data in enumerate(train_dataloader):
            # set new lr
            step_num = set_new_lr(step_num, batch_id)
            # forward and backward
            _, inputs, token_types, valid_length, start_label, end_label = data
            num_labels = len(inputs)
            log_num += num_labels
            total_num += num_labels

            with mx.autograd.record():
                out = net(inputs.as_in_context(ctx),
                          token_types.as_in_context(ctx),
                          valid_length.as_in_context(ctx).astype('float32'))

                loss = loss_function(out, [
                    start_label.as_in_context(ctx).astype('float32'),
                    end_label.as_in_context(ctx).astype('float32')
                ]).sum() / num_labels

                if accumulate:
                    loss = loss / accumulate
                if args.dtype == 'float16':
                    with amp.scale_loss(loss, trainer) as l:
                        mx.autograd.backward(l)
                        norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
                else:
                    mx.autograd.backward(loss)
                    norm_clip = 1.0 * size

            # update
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, norm_clip)
                trainer.update(1)
                if accumulate:
                    param_dict.zero_grad()

            if args.comm_backend == 'horovod':
                step_loss += hvd.allreduce(loss, average=True).asscalar()
            else:
                step_loss += loss.asscalar()

            if (batch_id + 1) % log_interval == 0:
                toc = time.time()
                log.info('Batch: {}/{}, Loss={:.4f}, lr={:.7f} '
                         'Thoughput={:.2f} samples/s'.format(
                             batch_id % len(train_dataloader),
                             len(train_dataloader), step_loss / log_interval,
                             trainer.learning_rate, log_num / (toc - tic)))
                tic = time.time()
                step_loss = 0.0
                log_num = 0

            if step_num >= num_train_steps:
                break
            batch_id += 1

        log.info('Finish training step: %d', step_num)
        epoch_toc = time.time()
        log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format(
            epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    if rank == 0:
        net.save_parameters(os.path.join(output_dir, 'net.params'))
Exemple #21
0
    def _train_loop(self, train_data, val_data, train_eval_data):
        # fix seed for mxnet, numpy and python builtin random generator.
        gutils.random.seed(self._cfg.train.seed)
        # loss and metric
        mbox_loss = SSDMultiBoxLoss()
        ce_metric = mx.metric.Loss('CrossEntropy')
        smoothl1_metric = mx.metric.Loss('SmoothL1')

        # lr decay policy
        lr_decay = float(self._cfg.train.lr_decay)
        lr_steps = sorted([float(ls) for ls in self._cfg.train.lr_decay_epoch])

        self._logger.info('Start training from [Epoch %d]',
                          max(self._cfg.train.start_epoch, self.epoch))

        self.net.collect_params().reset_ctx(self.ctx)
        for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                                self._cfg.train.epochs):
            epoch = self.epoch
            while lr_steps and epoch >= lr_steps[0]:
                new_lr = self.trainer.learning_rate * lr_decay
                lr_steps.pop(0)
                self.trainer.set_learning_rate(new_lr)
                self._logger.info("[Epoch {}] Set learning rate to {}".format(
                    epoch, new_lr))
            ce_metric.reset()
            smoothl1_metric.reset()
            tic = time.time()
            btic = time.time()
            self.net.hybridize(static_alloc=True, static_shape=True)

            for i, batch in enumerate(train_data):
                if self._cfg.train.dali:
                    # dali iterator returns a mxnet.io.DataBatch
                    data = [d.data[0] for d in batch]
                    box_targets = [d.label[0] for d in batch]
                    cls_targets = [
                        nd.cast(d.label[1], dtype='float32') for d in batch
                    ]
                else:
                    data = gluon.utils.split_and_load(batch[0],
                                                      ctx_list=self.ctx,
                                                      batch_axis=0,
                                                      even_split=False)
                    cls_targets = gluon.utils.split_and_load(batch[1],
                                                             ctx_list=self.ctx,
                                                             batch_axis=0,
                                                             even_split=False)
                    box_targets = gluon.utils.split_and_load(batch[2],
                                                             ctx_list=self.ctx,
                                                             batch_axis=0,
                                                             even_split=False)

                with autograd.record():
                    cls_preds = []
                    box_preds = []
                    for x in data:
                        cls_pred, box_pred, _ = self.net(x)
                        cls_preds.append(cls_pred)
                        box_preds.append(box_pred)
                    sum_loss, cls_loss, box_loss = mbox_loss(
                        cls_preds, box_preds, cls_targets, box_targets)
                    if self._cfg.ssd.amp:
                        with amp.scale_loss(sum_loss,
                                            self.trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(sum_loss)
                # since we have already normalized the loss, we don't want to normalize
                # by batch-size anymore
                self.trainer.step(1)

                if not self._cfg.horovod or hvd.rank() == 0:
                    local_batch_size = int(
                        self._cfg.train.batch_size //
                        (hvd.size() if self._cfg.horovod else 1))
                    ce_metric.update(0,
                                     [l * local_batch_size for l in cls_loss])
                    smoothl1_metric.update(
                        0, [l * local_batch_size for l in box_loss])
                    if self._cfg.train.log_interval and not (
                            i + 1) % self._cfg.train.log_interval:
                        name1, loss1 = ce_metric.get()
                        name2, loss2 = smoothl1_metric.get()
                        self._logger.info(
                            '[Epoch %d][Batch %d], Speed: %f samples/sec, %s=%f, %s=%f',
                            epoch, i,
                            self._cfg.train.batch_size / (time.time() - btic),
                            name1, loss1, name2, loss2)
                    btic = time.time()

            if not self._cfg.horovod or hvd.rank() == 0:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                self._logger.info('[Epoch %d] Training cost: %f, %s=%f, %s=%f',
                                  epoch, (time.time() - tic), name1, loss1,
                                  name2, loss2)
                if (epoch % self._cfg.valid.val_interval == 0) or \
                    (self._cfg.save_interval and epoch % self._cfg.save_interval == 0):
                    # consider reduce the frequency of validation to save time
                    map_name, mean_ap = self._evaluate(val_data)
                    val_msg = '\n'.join([
                        '{}={}'.format(k, v)
                        for k, v in zip(map_name, mean_ap)
                    ])
                    self._logger.info('[Epoch %d] Validation: \n%s', epoch,
                                      str(val_msg))
                    current_map = float(mean_ap[-1])
                    if current_map > self._best_map:
                        cp_name = os.path.join(self._logdir,
                                               'best_checkpoint.pkl')
                        self._logger.info(
                            '[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                            self.epoch, current_map, self._best_map, cp_name)
                        self.save(cp_name)
                        self._best_map = current_map
                if self._reporter:
                    self._reporter(epoch=epoch, map_reward=current_map)
            self._time_elapsed += time.time() - btic
        # map on train data
        map_name, mean_ap = self._evaluate(train_eval_data)
        return {
            'train_map': float(mean_ap[-1]),
            'valid_map': self._best_map,
            'time': self._time_elapsed
        }
Exemple #22
0
    def _train_loop(self, train_data, val_data, train_eval_data):
        # fix seed for mxnet, numpy and python builtin random generator.
        gutils.random.seed(self._cfg.train.seed)

        # metrics
        obj_metrics = mx.metric.Loss('ObjLoss')
        center_metrics = mx.metric.Loss('BoxCenterLoss')
        scale_metrics = mx.metric.Loss('BoxScaleLoss')
        cls_metrics = mx.metric.Loss('ClassLoss')
        trainer = self.trainer
        self._logger.info('Start training from [Epoch %d]', max(self._cfg.train.start_epoch, self.epoch))
        for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch), self._cfg.train.epochs):
            epoch = self.epoch
            tic = time.time()
            btic = time.time()
            if self._cfg.train.mixup:
                # TODO(zhreshold): more elegant way to control mixup during runtime
                try:
                    train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
                except AttributeError:
                    train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
                if epoch >= self._cfg.train.epochs - self._cfg.train.no_mixup_epochs:
                    try:
                        train_data._dataset.set_mixup(None)
                    except AttributeError:
                        train_data._dataset._data.set_mixup(None)

            mx.nd.waitall()
            self.net.hybridize()
            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(batch[0], ctx_list=self.ctx, batch_axis=0, even_split=False)
                # objectness, center_targets, scale_targets, weights, class_targets
                fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=self.ctx,
                                                            batch_axis=0, even_split=False) for it in range(1, 6)]
                gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=self.ctx, batch_axis=0, even_split=False)
                sum_losses = []
                obj_losses = []
                center_losses = []
                scale_losses = []
                cls_losses = []
                with autograd.record():
                    for ix, x in enumerate(data):
                        obj_loss, center_loss, scale_loss, cls_loss = self.net(x, gt_boxes[ix],
                                                                               *[ft[ix] for ft in fixed_targets])
                        sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                        obj_losses.append(obj_loss)
                        center_losses.append(center_loss)
                        scale_losses.append(scale_loss)
                        cls_losses.append(cls_loss)
                    if self._cfg.yolo3.amp:
                        with amp.scale_loss(sum_losses, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(sum_losses)
                trainer.step(self.batch_size)
                if (not self._cfg.horovod or hvd.rank() == 0):
                    obj_metrics.update(0, obj_losses)
                    center_metrics.update(0, center_losses)
                    scale_metrics.update(0, scale_losses)
                    cls_metrics.update(0, cls_losses)
                    if self._cfg.train.log_interval and not (i + 1) % self._cfg.train.log_interval:
                        name1, loss1 = obj_metrics.get()
                        name2, loss2 = center_metrics.get()
                        name3, loss3 = scale_metrics.get()
                        name4, loss4 = cls_metrics.get()
                        self._logger.info(
                            '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec,'
                            ' {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                                epoch, i, trainer.learning_rate, self._cfg.train.batch_size / (time.time() - btic),
                                name1, loss1, name2, loss2, name3, loss3, name4, loss4))
                    btic = time.time()

            if (not self._cfg.horovod or hvd.rank() == 0):
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                self._logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                    epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
                if not (epoch + 1) % self._cfg.valid.val_interval:
                    # consider reduce the frequency of validation to save time
                    map_name, mean_ap = self._evaluate(val_data)
                    val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                    self._logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                    current_map = float(mean_ap[-1])
                    if current_map > self._best_map:
                        cp_name = os.path.join(self._logdir, 'best_checkpoint.pkl')
                        self._logger.info('[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                                          self.epoch, current_map, self._best_map, cp_name)
                        self.save(cp_name)
                        self._best_map = current_map
                if self._reporter:
                    self._reporter(epoch=epoch, map_reward=current_map)
            self._time_elapsed += time.time() - btic

        # map on train data
        map_name, mean_ap = self._evaluate(train_eval_data)
        return {'train_map': float(mean_ap[-1]), 'valid_map': self._best_map, 'time': self._time_elapsed}
Exemple #23
0
def train(metric):
    """Training function."""
    if not only_inference:
        logging.info('Now we are doing BERT classification training on %s!',
                     ctx)

    all_model_params = model.collect_params()
    optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
    try:
        trainer = gluon.Trainer(all_model_params,
                                args.optimizer,
                                optimizer_params,
                                update_on_kvstore=False)
    except ValueError as e:
        print(e)
        warnings.warn(
            'AdamW optimizer is not found. Please consider upgrading to '
            'mxnet>=1.5.0. Now the original Adam optimizer is used instead.')
        trainer = gluon.Trainer(all_model_params,
                                'adam',
                                optimizer_params,
                                update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    step_num = 0

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in all_model_params.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'
    # track best eval score
    metric_history = []

    tic = time.time()
    for epoch_id in range(args.epochs):
        if not only_inference:
            metric.reset()
            step_loss = 0
            tic = time.time()
            all_model_params.zero_grad()

            for batch_id, seqs in enumerate(train_data):
                # learning rate schedule
                if step_num < num_warmup_steps:
                    new_lr = lr * step_num / num_warmup_steps
                else:
                    non_warmup_steps = step_num - num_warmup_steps
                    offset = non_warmup_steps / (num_train_steps -
                                                 num_warmup_steps)
                    new_lr = lr - offset * lr
                trainer.set_learning_rate(new_lr)

                # forward and backward
                with mx.autograd.record():
                    input_ids, valid_length, type_ids, label = seqs
                    out = model(
                        input_ids.as_in_context(ctx),
                        type_ids.as_in_context(ctx),
                        valid_length.astype('float32').as_in_context(ctx))
                    ls = loss_function(out, label.as_in_context(ctx)).mean()
                    if args.dtype == 'float16':
                        with amp.scale_loss(ls, trainer) as scaled_loss:
                            mx.autograd.backward(scaled_loss)
                    else:
                        ls.backward()

                # update
                if not accumulate or (batch_id + 1) % accumulate == 0:
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(accumulate if accumulate else 1)
                    step_num += 1
                    if accumulate and accumulate > 1:
                        # set grad to zero for gradient accumulation
                        all_model_params.zero_grad()

                step_loss += ls.asscalar()
                metric.update([label], [out])
                if (batch_id + 1) % (args.log_interval) == 0:
                    log_train(batch_id, len(train_data), metric, step_loss,
                              args.log_interval, epoch_id,
                              trainer.learning_rate)
                    step_loss = 0
            mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:
            metric_nm, metric_val = evaluate(dev_data, metric, segment)
            metric_history.append((epoch_id, metric_nm, metric_val))

        if not only_inference:
            # save params
            ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
            params_saved = os.path.join(output_dir, ckpt_name)

            nlp.utils.save_parameters(model, params_saved)
            logging.info('params saved in: %s', params_saved)
            toc = time.time()
            logging.info('Time cost=%.2fs', toc - tic)
            tic = toc

    if not only_inference:
        # we choose the best model based on metric[0],
        # assuming higher score stands for better model quality
        metric_history.sort(key=lambda x: x[2][0], reverse=True)
        epoch_id, metric_nm, metric_val = metric_history[0]
        ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
        params_saved = os.path.join(output_dir, ckpt_name)
        nlp.utils.load_parameters(model, params_saved)
        metric_str = 'Best model at epoch {}. Validation metrics:'.format(
            epoch_id)
        metric_str += ','.join([i + ':%.4f' for i in metric_nm])
        logging.info(metric_str, *metric_val)

    # inference on test data
    for segment, test_data in test_data_list:
        test(test_data, segment)
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if opt.partial_bn:
            train_patterns = None
            if 'inceptionv3' in opt.model:
                train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
            else:
                logger.info(
                    'Current model does not support partial batch normalization.'
                )

            if opt.kvstore is not None:
                trainer = gluon.Trainer(net.collect_params(train_patterns),
                                        optimizer,
                                        optimizer_params,
                                        kvstore=kv,
                                        update_on_kvstore=False)
            else:
                trainer = gluon.Trainer(net.collect_params(train_patterns),
                                        optimizer,
                                        optimizer_params,
                                        update_on_kvstore=False)
        else:
            if opt.kvstore is not None:
                trainer = gluon.Trainer(net.collect_params(),
                                        optimizer,
                                        optimizer_params,
                                        kvstore=kv,
                                        update_on_kvstore=False)
            else:
                trainer = gluon.Trainer(net.collect_params(),
                                        optimizer,
                                        optimizer_params,
                                        update_on_kvstore=False)

        if opt.accumulate > 1:
            params = [
                p for p in net.collect_params().values()
                if p.grad_req != 'null'
            ]
            for p in params:
                p.grad_req = 'add'

        if opt.resume_states is not '':
            trainer.load_states(opt.resume_states)

        if opt.use_amp:
            amp.init_trainer(trainer)

        L = gluon.loss.SoftmaxCrossEntropyLoss()

        best_val_score = 0
        lr_decay_count = 0

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            btic = time.time()
            num_train_iter = len(train_data)
            train_loss_epoch = 0
            train_loss_iter = 0

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                with ag.record():
                    outputs = []
                    for _, X in enumerate(data):
                        X = X.reshape((-1, ) + X.shape[2:])
                        pred = net(X.astype(opt.dtype, copy=False))
                        outputs.append(pred)
                    loss = [
                        L(yhat, y.astype(opt.dtype, copy=False))
                        for yhat, y in zip(outputs, label)
                    ]

                    if opt.use_amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)

                if opt.accumulate > 1 and (i + 1) % opt.accumulate == 0:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers *
                                     opt.accumulate)
                    else:
                        trainer.step(batch_size * opt.accumulate)
                        net.collect_params().zero_grad()
                else:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers)
                    else:
                        trainer.step(batch_size)

                train_metric.update(label, outputs)
                train_loss_iter = sum([l.mean().asscalar()
                                       for l in loss]) / len(loss)
                train_loss_epoch += train_loss_iter

                train_metric_name, train_metric_score = train_metric.get()
                sw.add_scalar(tag='train_acc_top1_iter',
                              value=train_metric_score * 100,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='train_loss_iter',
                              value=train_loss_iter,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='learning_rate_iter',
                              value=trainer.learning_rate,
                              global_step=epoch * num_train_iter + i)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    logger.info(
                        'Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f'
                        % (epoch, i, num_train_iter,
                           batch_size * opt.log_interval /
                           (time.time() - btic), train_metric_name,
                           train_metric_score * 100, train_loss_epoch /
                           (i + 1), trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))
            mx.ndarray.waitall()

            if opt.kvstore is not None and epoch == opt.resume_epoch:
                kv.init(111111, nd.zeros(1))
                kv.init(555555, nd.zeros(1))
                kv.init(999999, nd.zeros(1))

            if opt.kvstore is not None:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data, kv)
            else:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

            logger.info('[Epoch %03d] training: %s=%f\t loss=%f' %
                        (epoch, train_metric_name, train_metric_score * 100,
                         train_loss_epoch / num_train_iter))
            logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            logger.info(
                '[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f' %
                (epoch, acc_top1_val * 100, acc_top5_val * 100, loss_val))

            sw.add_scalar(tag='train_loss_epoch',
                          value=train_loss_epoch / num_train_iter,
                          global_step=epoch)
            sw.add_scalar(tag='val_loss_epoch',
                          value=loss_val,
                          global_step=epoch)
            sw.add_scalar(tag='val_acc_top1_epoch',
                          value=acc_top1_val * 100,
                          global_step=epoch)

            if acc_top1_val > best_val_score:
                best_val_score = acc_top1_val
                net.save_parameters('%s/%.4f-%s-%s-%03d-best.params' %
                                    (opt.save_dir, best_val_score, opt.dataset,
                                     model_name, epoch))
                trainer.save_states('%s/%.4f-%s-%s-%03d-best.states' %
                                    (opt.save_dir, best_val_score, opt.dataset,
                                     model_name, epoch))
            else:
                if opt.save_frequency and opt.save_dir and (
                        epoch + 1) % opt.save_frequency == 0:
                    net.save_parameters(
                        '%s/%s-%s-%03d.params' %
                        (opt.save_dir, opt.dataset, model_name, epoch))
                    trainer.save_states(
                        '%s/%s-%s-%03d.states' %
                        (opt.save_dir, opt.dataset, model_name, epoch))

        # save the last model
        net.save_parameters(
            '%s/%s-%s-%03d.params' %
            (opt.save_dir, opt.dataset, model_name, opt.num_epochs - 1))
        trainer.save_states(
            '%s/%s-%s-%03d.states' %
            (opt.save_dir, opt.dataset, model_name, opt.num_epochs - 1))
Exemple #25
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)

    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
                        net.collect_params(), 'sgd',
                        {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})
    else:
        trainer = gluon.Trainer(
                    net.collect_params(), 'sgd',
                    {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum},
                    update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]

    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)

        for i, batch in enumerate(train_data):
            if args.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [nd.cast(d.label[1], dtype='float32') for d in batch]

            else:
                data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
                box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)

            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)

            if (not args.horovod or hvd.rank() == 0):
                local_batch_size = int(args.batch_size // (hvd.size() if args.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(0, [l * local_batch_size for l in box_loss])
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                        epoch, i, args.batch_size/(time.time()-btic), name1, loss1, name2, loss2))
                btic = time.time()

        if (not args.horovod or hvd.rank() == 0):
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                epoch, (time.time()-tic), name1, loss1, name2, loss2))
            if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
Exemple #26
0
                         num_rcnn_pos
            rcnn_loss = rcnn_loss1 + rcnn_loss2
            # overall losses
            total_loss = rpn_loss.sum() * self.mix_ratio + rcnn_loss.sum() * self.mix_ratio

            rpn_loss1_metric = rpn_loss1.mean() * self.mix_ratio
            rpn_loss2_metric = rpn_loss2.mean() * self.mix_ratio
            rcnn_loss1_metric = rcnn_loss1.mean() * self.mix_ratio
            rcnn_loss2_metric = rcnn_loss2.mean() * self.mix_ratio
            rpn_acc_metric = [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]]
            rpn_l1_loss_metric = [[rpn_box_targets, rpn_box_masks], [rpn_box]]
            rcnn_acc_metric = [[cls_targets], [cls_pred]]
            rcnn_l1_loss_metric = [[box_targets, box_masks], [box_pred]]

            if args.amp:
                with amp.scale_loss(total_loss, self._optimizer) as scaled_losses:
                    autograd.backward(scaled_losses)
            else:
                total_loss.backward()

        return rpn_loss1_metric, rpn_loss2_metric, rcnn_loss1_metric, rcnn_loss2_metric, \
               rpn_acc_metric, rpn_l1_loss_metric, rcnn_acc_metric, rcnn_l1_loss_metric


def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
    """Training pipeline"""
    kv = mx.kvstore.create(args.kv_store)
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
<<<<<<< HEAD
Exemple #27
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])

    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd', {
            'wd': args.wd,
            'momentum': args.momentum,
            'lr_scheduler': lr_scheduler
        })
    else:
        trainer = gluon.Trainer(
            net.collect_params(),
            'sgd', {
                'wd': args.wd,
                'momentum': args.momentum,
                'lr_scheduler': lr_scheduler
            },
            kvstore='local',
            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0) for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss +
                                      cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                if args.amp:
                    with amp.scale_loss(sum_losses, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_losses)
            trainer.step(batch_size)
            if (not args.horovod or hvd.rank() == 0):
                obj_metrics.update(0, obj_losses)
                center_metrics.update(0, center_losses)
                scale_metrics.update(0, scale_losses)
                cls_metrics.update(0, cls_losses)
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = obj_metrics.get()
                    name2, loss2 = center_metrics.get()
                    name3, loss3 = scale_metrics.get()
                    name4, loss4 = cls_metrics.get()
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, trainer.learning_rate,
                                args.batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2, name3, loss3, name4,
                                loss4))
                btic = time.time()

        if (not args.horovod or hvd.rank() == 0):
            name1, loss1 = obj_metrics.get()
            name2, loss2 = center_metrics.get()
            name3, loss3 = scale_metrics.get()
            name4, loss4 = cls_metrics.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                        name3, loss3, name4, loss4))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if opt.partial_bn:
            train_patterns = "None"
            if 'inceptionv3' in opt.model:
                train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
            elif 'inceptionv1' in opt.model:
                train_patterns = '.*weight|.*bias|googlenet0_batchnorm0_gamma|googlenet0_batchnorm0_beta|googlenet0_batchnorm0_running_mean|googlenet0_batchnorm0_running_var'
            else:
                logger.info(
                    'Current model does not support partial batch normalization.'
                )

            # trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, update_on_kvstore=False)
            trainer = gluon.Trainer(net.collect_params(train_patterns),
                                    optimizer,
                                    update_on_kvstore=False)

        elif opt.freeze_bn:
            train_patterns = '.*weight|.*bias'
            # trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, update_on_kvstore=False)
            trainer = gluon.Trainer(net.collect_params(train_patterns),
                                    optimizer,
                                    update_on_kvstore=False)

        else:
            # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, update_on_kvstore=False)
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    update_on_kvstore=False)

        if opt.accumulate > 1:
            params = [
                p for p in net.collect_params().values()
                if p.grad_req != 'null'
            ]
            for p in params:
                p.grad_req = 'add'

        if opt.resume_states is not '':
            trainer.load_states(opt.resume_states)

        if opt.use_amp:
            amp.init_trainer(trainer)

        L = gluon.loss.SoftmaxCrossEntropyLoss()

        best_val_score = 0
        lr_decay_count = 0
        #compute weights
        weights = get_weights(opt).reshape(1, opt.num_classes)
        weights = mx.nd.array(weights, ctx=mx.gpu(0))

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            btic = time.time()
            num_train_iter = len(train_data)
            train_loss_epoch = 0
            train_loss_iter = 0

            for i, batch in tqdm(enumerate(train_data)):
                data, label = batch_fn(batch, ctx)

                with ag.record():
                    outputs = []
                    for _, X in enumerate(data):
                        X = X.reshape((-1, ) + X.shape[2:])
                        # pred = net(X.astype(opt.dtype, copy=False))
                        pred = net(X)
                        outputs.append(pred)
                    if (opt.balanced):
                        loss = [
                            L(yhat, y.astype(opt.dtype, copy=False), weights)
                            for yhat, y in zip(outputs, label)
                        ]

                    else:
                        loss = [
                            L(yhat, y.astype(opt.dtype, copy=False))
                            for yhat, y in zip(outputs, label)
                        ]

                    if opt.use_amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)

                if opt.accumulate > 1:
                    if (i + 1) % opt.accumulate == 0:
                        trainer.step(batch_size * opt.accumulate)
                        net.collect_params().zero_grad()
                else:
                    trainer.step(batch_size)

                train_metric.update(label, outputs)
                train_loss_iter = sum([l.mean().asscalar()
                                       for l in loss]) / len(loss)
                train_loss_epoch += train_loss_iter

                train_metric_name, train_metric_score = train_metric.get()
                sw.add_scalar(tag='train_acc_top1_iter',
                              value=train_metric_score * 100,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='train_loss_iter',
                              value=train_loss_iter,
                              global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='learning_rate_iter',
                              value=trainer.learning_rate,
                              global_step=epoch * num_train_iter + i)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    logger.info(
                        'Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f'
                        % (epoch, i, num_train_iter,
                           batch_size * opt.log_interval /
                           (time.time() - btic), train_metric_name,
                           train_metric_score * 100, train_loss_epoch /
                           (i + 1), trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))
            mx.ndarray.waitall()

            logger.info('[Epoch %03d] training: %s=%f\t loss=%f' %
                        (epoch, train_metric_name, train_metric_score * 100,
                         train_loss_epoch / num_train_iter))
            logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            sw.add_scalar(tag='train_loss_epoch',
                          value=train_loss_epoch / num_train_iter,
                          global_step=epoch)

            if not opt.train_only:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

                logger.info(
                    '[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f'
                    %
                    (epoch, acc_top1_val * 100, acc_top5_val * 100, loss_val))
                sw.add_scalar(tag='val_loss_epoch',
                              value=loss_val,
                              global_step=epoch)
                sw.add_scalar(tag='val_acc_top1_epoch',
                              value=acc_top1_val * 100,
                              global_step=epoch)

                if acc_top1_val > best_val_score:
                    best_val_score = acc_top1_val
                    net.save_parameters('%s/%.4f-%s-%s-%03d-best.params' %
                                        (opt.save_dir, best_val_score,
                                         opt.dataset, model_name, epoch))
                    trainer.save_states('%s/%.4f-%s-%s-%03d-best.states' %
                                        (opt.save_dir, best_val_score,
                                         opt.dataset, model_name, epoch))
                # else:
                #     if opt.save_frequency and opt.save_dir and (epoch + 1) % opt.save_frequency == 0:
                #         net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, epoch))
                #         trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, epoch))

        # # save the last model
        # net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
        # trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
        def return_float(el):
            return float(el)

        try:
            #remove "trash" files
            performances = [
                get_file_stem(file).split("-")[0]
                for file in os.listdir(opt.save_dir) if "params" in file
            ]

            best_performance = sorted(performances,
                                      key=return_float,
                                      reverse=True)[0]

            params_trash = [
                os.path.join(opt.save_dir, file)
                for file in os.listdir(opt.save_dir)
                if (("params" in file) and not (best_performance in file))
            ]
            states_trash = [
                os.path.join(opt.save_dir, file)
                for file in os.listdir(opt.save_dir)
                if (("states" in file) and not (best_performance in file))
            ]
            trash_files = params_trash + states_trash

            for file in trash_files:
                os.remove(file)
        except:
            print("Sth went wrong...")
Exemple #29
0
    def train(self):

        self.net.collect_params().reset_ctx(self.ctx)

        trainer = gluon.Trainer(
            params=self.net.collect_params(),
            optimizer='sgd',
            optimizer_params={
                'learning_rate': self.lr,
                'wd': self.wd,
                'momentum': self.momentum
            },
            update_on_kvstore=(False if self.use_amp else None))

        if self.use_amp:
            amp.init_trainer(trainer)

        lr_decay = self.lr_decay
        lr_steps = sorted(
            [float(ls) for ls in self.lr_decay_epoch.split(',') if ls.strip()])

        mbox_loss = SSDMultiBoxLoss()
        ce_metric = mx.metric.Loss('CrossEntropy')
        smoothl1_metric = mx.metric.Loss('SmoothL1')

        logging.info('Start training from scratch...')

        for epoch in range(self.epoch):
            while lr_steps and epoch > lr_steps[0]:
                new_lr = trainer.learning_rate * lr_decay
                lr_steps.pop(0)
                trainer.set_learning_rate(new_lr)
                logging.info("Epoch {} Set learning rate to {}".format(
                    epoch, new_lr))
            ce_metric.reset()
            smoothl1_metric.reset()
            tic = time.time()
            btic = time.time()
            # reset cause save params may change
            self.net.collect_params().reset_ctx(self.ctx)
            self.net.hybridize(static_alloc=True, static_shape=True)
            for i, batch in enumerate(self.train_data):
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [
                    nd.cast(d.label[1], dtype='float32') for d in batch
                ]

                with autograd.record():
                    cls_preds = []
                    box_preds = []
                    for x in data:
                        cls_pred, box_pred, _ = self.net(x)
                        cls_preds.append(cls_pred)
                        box_preds.append(box_pred)
                    sum_loss, cls_loss, box_loss = mbox_loss(
                        cls_preds, box_preds, cls_targets, box_targets)
                    if self.use_amp:
                        with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(sum_loss)
                # since we have already normalized the loss, we don't want to normalize
                # by batch-size anymore
                trainer.step(1)
                ce_metric.update(0, [l * self.batch_size for l in cls_loss])
                smoothl1_metric.update(0,
                                       [l * self.batch_size for l in box_loss])
                if i > 0 and i % 50 == 0:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logging.info('Epoch {} Batch {} Speed: {:.3f} samples/s, {}={:.3f}, {}={:.3f}'.\
                           format(epoch, i, self.batch_size/(time.time()-btic), name1, loss1, name2, loss2))

                btic = time.time()
            map_name, mean_ap = self.validation()
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logging.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            self.save_params(epoch)
Exemple #30
0
def model_fit(args, net, train_data, eval_metric, optimizer,
        optimizer_params, lr_scheduler, eval_data, kvstore, kv,
        begin_epoch, num_epoch, model_prefix, report, print_loss):

    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)
    loss_metric = ScalarMetric()

    if 'horovod' in kvstore:
        trainer = hvd.DistributedTrainer(net.collect_params(), optimizer, optimizer_params)
    else:
        trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params,
                                kvstore=kv, update_on_kvstore=False)

    if args.amp:
        amp.init_trainer(trainer)

    sparse_label_loss = (args.label_smoothing == 0 and args.mixup == 0)
    loss = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
    loss.hybridize(static_shape=True, static_alloc=True)

    local_batch_size = train_data.batch_size
    total_batch_size = local_batch_size * train_data._num_gpus * (hvd.size() if 'horovod' in kvstore else 1)
    durations = []

    epoch_size = get_epoch_size(args, kv)

    def transform_data(images, labels):
        if args.mixup != 0:
            coeffs = mx.nd.array(np.random.beta(args.mixup, args.mixup, size=images.shape[0])).as_in_context(images.context)
            image_coeffs = coeffs.astype(images.dtype, copy=False).reshape(*coeffs.shape, 1, 1, 1)
            ret_images = image_coeffs * images + (1 - image_coeffs) * images[::-1]

            ret_labels = label_smoothing(labels, args.num_classes, args.label_smoothing)
            label_coeffs = coeffs.reshape(*coeffs.shape, 1)
            ret_labels = label_coeffs * ret_labels + (1 - label_coeffs) * ret_labels[::-1]
        else:
            ret_images = images
            if not sparse_label_loss:
                ret_labels = label_smoothing(labels, args.num_classes, args.label_smoothing)
            else:
                ret_labels = labels

        return ret_images, ret_labels


    best_accuracy = -1
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        train_data.reset()
        eval_metric.reset()
        loss_metric.reset()
        btic = time.time()

        logging.info('Starting epoch {}'.format(epoch))
        outputs = []
        for i, batches in enumerate(train_data):
            # synchronize to previous iteration
            for o in outputs:
                o.wait_to_read()

            trainer.set_learning_rate(lr_scheduler(epoch + i / epoch_size))

            data = [b.data[0] for b in batches]
            label = [b.label[0].as_in_context(b.data[0].context) for b in batches]
            orig_label = label

            data, label = zip(*starmap(transform_data, zip(data, label)))

            outputs = []
            Ls = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    outputs.append(z)

                if args.amp:
                    with amp.scale_loss(Ls, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
                else:
                    ag.backward(Ls)

            if 'horovod' in kvstore:
                trainer.step(local_batch_size)
            else:
                trainer.step(total_batch_size)

            if print_loss:
                loss_metric.update(..., np.mean([l.asnumpy() for l in Ls]).item())
            eval_metric.update(orig_label, outputs)

            if args.disp_batches and not (i + 1) % args.disp_batches:
                name, acc = eval_metric.get()
                if print_loss:
                    name = [loss_metric.get()[0]] + name
                    acc = [loss_metric.get()[1]] + acc

                logging.info('Epoch[{}] Batch [{}-{}]\tSpeed: {} samples/sec\tLR: {}\t{}'.format(
                    epoch, (i // args.disp_batches) * args.disp_batches, i,
                    args.disp_batches * total_batch_size / (time.time() - btic), trainer.learning_rate,
                    '\t'.join(list(map(lambda x: '{}: {:.6f}'.format(*x), zip(name, acc))))))
                eval_metric.reset_local()
                loss_metric.reset_local()
                btic = time.time()

            durations.append(time.time() - tic)
            tic = time.time()


        add_metrics_to_report(report, 'train', dict(eval_metric.get_global_name_value()), durations, total_batch_size, loss_metric if print_loss else None)

        if args.mode == 'train_val':
            logging.info('Validating epoch {}'.format(epoch))
            score = model_score(args, net, eval_data, eval_metric, kvstore, report)
            for name, value in zip(*score):
                logging.info('Epoch[{}] Validation {:20}: {}'.format(epoch, name, value))

            score = dict(zip(*score))
            accuracy = score.get('accuracy', -1)
            save_checkpoint(net, epoch, accuracy, best_accuracy, model_prefix, args.save_frequency, kvstore)
            best_accuracy = max(best_accuracy, accuracy)