def test_single_method():
    N = 1000
    constant = LRScheduler('constant', base_lr=0, target_lr=1, niters=N)
    linear = LRScheduler('linear', base_lr=1, target_lr=2, niters=N)
    cosine = LRScheduler('cosine', base_lr=3, target_lr=1, niters=N)
    poly = LRScheduler('poly', base_lr=1, target_lr=0, niters=N, power=2)
    step = LRScheduler('step',
                       base_lr=1,
                       target_lr=0,
                       niters=N,
                       step_iter=[100, 500],
                       step_factor=0.1)
    step2 = LRScheduler('step',
                        base_lr=1,
                        target_lr=0,
                        nepochs=2,
                        iters_per_epoch=N / 2,
                        step_iter=[100, 500],
                        step_factor=0.1)
    step3 = LRScheduler('step',
                        base_lr=1,
                        target_lr=0,
                        nepochs=100,
                        iters_per_epoch=N / 100,
                        step_epoch=[10, 50],
                        step_factor=0.1)

    # Test numerical value
    for i in range(N):
        compare(constant, i, 0)

        expect_linear = 2 + (1 - 2) * (1 - i / (N - 1))
        compare(linear, i, expect_linear)

        expect_cosine = 1 + (3 - 1) * ((1 + cos(pi * i / (N - 1))) / 2)
        compare(cosine, i, expect_cosine)

        expect_poly = 0 + (1 - 0) * (pow(1 - i / (N - 1), 2))
        compare(poly, i, expect_poly)

        if i < 100:
            expect_step = 1
        elif i < 500:
            expect_step = 0.1
        else:
            expect_step = 0.01
        compare(step, i, expect_step)
        compare(step2, i, expect_step)
        compare(step3, i, expect_step)

    # Test out-of-range updates
    for i in range(10):
        constant.update(i - 3)
        linear.update(i - 3)
        cosine.update(i - 3)
        poly.update(i - 3)
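
The test above relies on a compare helper that is not part of this excerpt (as are the math.cos/math.pi and LRScheduler imports). A minimal sketch of what it presumably does, assuming the scheduler exposes its current value through a learning_rate attribute after update() is called (both names are assumptions, not confirmed by the excerpt):

import numpy as np

def compare(scheduler, iteration, expected, rtol=1e-6, atol=1e-8):
    """Hypothetical helper: advance the scheduler to `iteration` and check its value."""
    scheduler.update(iteration)
    assert np.isclose(scheduler.learning_rate, expected, rtol=rtol, atol=atol)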
Example #2
class Trainer(object):
    def __init__(self,
                 flag,
                 batch_size,
                 use_global_stats=True,
                 checkpoint_interval=5,
                 epochs=50,
                 learning_rate=1.e-4,
                 momentum=0.9,
                 weight_decay=1.e-4,
                 train_OS=16,
                 train_split='train_aug',
                 val_split='val',
                 resume=None,
                 test_batch_size=None,
                 data_root=os.path.expanduser('~/.mxnet/datasets/voc'),
                 num_workers=4):

        if test_batch_size is None:
            test_batch_size = batch_size

        self.running_flag = flag
        self.checkpoint_interval = checkpoint_interval

        # dataset and dataloader
        train_dataset = VOCAugSegmentation(root=data_root, split=train_split)
        val_dataset = VOCAugSegmentation(root=data_root, split=val_split)
        self.train_data = gluon.data.DataLoader(train_dataset,
                                                batch_size,
                                                shuffle=True,
                                                last_batch='rollover',
                                                num_workers=num_workers)
        self.eval_data = gluon.data.DataLoader(val_dataset,
                                               test_batch_size,
                                               last_batch='keep',
                                               num_workers=num_workers)

        # create network
        model = DeepLabv3p(OS=train_OS,
                           classes=21,
                           use_global_stats=use_global_stats)
        self.net = model
        print(model)

        # resume checkpoint if needed
        if resume is not None:
            if os.path.isfile(resume):
                model.load_params(resume, ctx=mx.gpu())
            else:
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(resume))
        else:
            model.initialize(ctx=mx.gpu())

        # create criterion
        self.criterion = SoftmaxCrossEntropyLoss()

        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly',
                                        baselr=learning_rate,
                                        niters=len(self.train_data),
                                        nepochs=epochs)
        self.optimizer = gluon.Trainer(
            self.net.collect_params(), 'sgd', {
                'lr_scheduler': self.lr_scheduler,
                'wd': weight_decay,
                'momentum': momentum,
                'multi_precision': True
            })

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.
        for i, (data, target) in enumerate(tbar):
            data = data.copyto(mx.gpu())
            target = target.copyto(mx.gpu())
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data)
                losses = self.criterion(outputs, target)
                loss = losses.mean()
                mx.nd.waitall()
                loss.backward()
            self.optimizer.step(batch_size=1)  # loss already averaged over the batch, no further rescaling
            train_loss += loss.asscalar()
            tbar.set_description('Epoch %d, training loss %.3f' %
                                 (epoch, train_loss / (i + 1)))
            mx.nd.waitall()
            # break

    def validation(self, epoch, train=False):
        if train:
            loader = self.train_data
            flag = "train"
        else:
            loader = self.eval_data
            flag = 'val'

        tbar = tqdm(loader)
        total_inter, total_union, total_correct, total_label = (0, ) * 4
        for i, (x, y) in enumerate(tbar):
            x = x.copyto(mx.gpu())
            y = y.copyto(mx.gpu())
            pred = self.net(x)
            correct, labeled = batch_pix_accuracy(output=pred, target=y)
            inter, union = batch_intersection_union(output=pred,
                                                    target=y,
                                                    nclass=21)
            total_correct += correct.astype('int64')
            total_label += labeled.astype('int64')
            total_inter += inter.astype('int64')
            total_union += union.astype('int64')
            pix_acc = np.float64(1.0) * total_correct / (
                np.spacing(1, dtype=np.float64) + total_label)
            IoU = np.float64(1.0) * total_inter / (
                np.spacing(1, dtype=np.float64) + total_union)
            mIoU = IoU.mean()
            tbar.set_description('%s - Epoch %s, pix_acc: %.4f, mIoU: %.4f' %
                                 (flag, epoch, pix_acc, mIoU))
            mx.nd.waitall()
            # break

        return pix_acc, mIoU

    def save_checkpoint(self, epoch, is_best=False):
        save_checkpoint(self.running_flag, self.net, epoch,
                        self.checkpoint_interval, is_best)
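
A driver loop for this Trainer is not part of the excerpt. A minimal sketch of how it could be used, with placeholder argument values that are not taken from the original script:

if __name__ == '__main__':
    trainer = Trainer(flag='deeplabv3p_voc', batch_size=8, epochs=50)
    best_miou = 0.0
    for epoch in range(50):
        trainer.training(epoch)
        pix_acc, miou = trainer.validation(epoch)
        if miou > best_miou:
            best_miou = miou
            trainer.save_checkpoint(epoch, is_best=True)
        else:
            trainer.save_checkpoint(epoch)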
class Trainer(object):
    def __init__(self, flag, batch_size,
                 use_global_stats=True,
                 checkpoint_interval=5,
                 epochs=50,
                 learning_rate=1.e-4,
                 momentum=0.9,
                 weight_decay=4.e-5,
                 train_OS=16,
                 train_split='train_aug',
                 val_split='val',
                 resume=None,
                 test_batch_size=None,
                 data_root=os.path.expanduser('~/.mxnet/datasets/voc'),
                 ctx=[mx.gpu()],
                 norm_layer=gluon.nn.BatchNorm,
                 num_workers=4):

        if test_batch_size is None:
            test_batch_size = batch_size

        self.running_flag = flag
        self.checkpoint_interval = checkpoint_interval
        self.batch_size = batch_size

        # dataset and dataloader
        train_dataset = VOCAugSegmentation(root=data_root, split=train_split)
        val_dataset = VOCAugSegmentation(root=data_root, split=val_split)
        self.train_data = gluon.data.DataLoader(train_dataset, batch_size, shuffle=True, last_batch='rollover',
                                                num_workers=num_workers)
        self.eval_data = gluon.data.DataLoader(val_dataset, test_batch_size,
                                               last_batch='keep', num_workers=num_workers)

        # create network
        model = DeepLabv3p(OS=train_OS, classes=21, use_global_stats=use_global_stats, norm_layer=norm_layer)
        print(model)

        # resume checkpoint if needed
        if resume is not None:
            if os.path.isfile(resume):
                model.load_parameters(resume, ctx=ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'".format(resume))
        else:
            model.initialize(ctx=ctx)

        self.net = DataParallelModel(model, ctx, sync=True)
        self.evaluator = DataParallelModel(SegEvalModel(model), ctx)

        # create criterion
        self.criterion = DataParallelCriterion(SoftmaxCrossEntropyLoss(), ctx, sync=True)

        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=learning_rate, niters=len(self.train_data),
                                        nepochs=epochs)
        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       {'lr_scheduler': self.lr_scheduler,
                                        'wd': weight_decay,
                                        'momentum': momentum,
                                        'multi_precision': True})

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(train_mode=True):
                outputs = self.net(data)
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
                autograd.backward(losses)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            self.optimizer.step(batch_size=self.batch_size)
            tbar.set_description('Epoch %d, training loss %.3f' % (epoch, train_loss / (i + 1)))
            mx.nd.waitall()
            # break

    def validation(self, epoch, train=False):
        if train:
            loader = self.train_data
            flag = "train"
        else:
            loader = self.eval_data
            flag = 'val'

        tbar = tqdm(loader)
        total_inter, total_union, total_correct, total_label = (0,) * 4
        for i, (x, y) in enumerate(tbar):
            outputs = self.evaluator(x, y)
            for (correct, labeled, inter, union) in outputs:
                total_correct += correct
                total_label += labeled
                total_inter += inter
                total_union += union
            pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
            IoU = 1.0 * total_inter / (np.spacing(1) + total_union)
            mIoU = IoU.mean()
            tbar.set_description('%s - Epoch %s, pixAcc: %.4f, mIoU: %.4f' %
                                 (flag, epoch, pixAcc, mIoU))
            mx.nd.waitall()

        return pixAcc, mIoU

    def save_checkpoint(self, epoch, is_best=False):
        save_checkpoint(self.running_flag, self.net.module, epoch, self.checkpoint_interval, is_best)
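
Both Trainer variants above drive the scheduler with update(i, epoch), so under mode='poly' the learning rate presumably decays with the global iteration index. A sketch of that schedule, assuming it follows the same formula as the unit test at the top of this page (power 2, target learning rate 0); the exact denominator used inside GluonCV may differ slightly:

def poly_lr(base_lr, i, epoch, niters, nepochs, power=2):
    """Sketch of the per-iteration poly decay implied by the test above."""
    T = epoch * niters + i          # global iteration index
    N = nepochs * niters            # total number of iterations
    return base_lr * (1.0 - T / N) ** power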
Example #4
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)

    trainer = gluon.Trainer(net.collect_params(),
                            'sgd', {
                                'wd': args.wd,
                                'momentum': args.momentum,
                                'lr_scheduler': lr_scheduler
                            },
                            kvstore='local')

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0) for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss +
                                      cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info(
                    '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                    .format(epoch, i, trainer.learning_rate,
                            batch_size / (time.time() - btic), name1, loss1,
                            name2, loss2, name3, loss3, name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
            .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                    name3, loss3, name4, loss4))
        if not (epoch + 1) % args.val_interval:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
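
save_params is referenced above but not defined in this excerpt. A sketch consistent with its call sites (best_map is a one-element list holding the best mAP seen so far); the file-naming scheme below is an assumption:

def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    """Hypothetical checkpoint helper matching save_params(net, best_map, ...)."""
    current_map = float(current_map)
    if current_map > best_map[0]:
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
    if save_interval and (epoch + 1) % save_interval == 0:
        net.save_parameters(
            '{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))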
Example #5
def train(net, train_data, val_data, eval_metric, polygon_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_steps,
                               step_factor=lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)

    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum,
            'lr_scheduler': lr_scheduler
        })

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('cls_loss')
    smoothl1_metric = mx.metric.Loss('box_loss')
    coef_center_metric = mx.metric.Loss('center_loss')
    coef_metric = mx.metric.Loss('coef_loss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        # while lr_steps and epoch >= lr_steps[0]:
        #     new_lr = trainer.learning_rate * lr_decay
        #     lr_steps.pop(0)
        #     trainer.set_learning_rate(new_lr)
        #     logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        coef_metric.reset()
        coef_center_metric.reset()

        tic = time.time()
        btic = time.time()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1],
                                                     ctx_list=ctx,
                                                     batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2],
                                                     ctx_list=ctx,
                                                     batch_axis=0)
            coef_center_targets = gluon.utils.split_and_load(batch[3],
                                                             ctx_list=ctx,
                                                             batch_axis=0)
            coef_targets = gluon.utils.split_and_load(batch[4],
                                                      ctx_list=ctx,
                                                      batch_axis=0)

            with autograd.record():
                cls_preds = []
                box_preds = []
                coef_preds = []
                coef_center_preds = []
                for x in data:
                    cls_pred, box_pred, _, coef_center_pred, coef_pred = net(x)
                    # print(cls_pred.shape, box_pred.shape, coef_center_pred.shape)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                    coef_preds.append(coef_pred)
                    coef_center_preds.append(coef_center_pred)

                sum_loss, cls_loss, box_loss, coef_center_loss, coef_loss = mbox_loss(
                    cls_preds, box_preds, coef_center_preds, coef_preds,
                    cls_targets, box_targets, coef_center_targets,
                    coef_targets)
                autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)

            lr_scheduler.update(i, epoch)

            coef_center_metric.update(
                0, [l * batch_size for l in coef_center_loss])
            coef_metric.update(0, [l * batch_size for l in coef_loss])
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                name3, loss3 = coef_center_metric.get()
                name4, loss4 = coef_metric.get()
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, LR:{}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                    .format(epoch, i, batch_size / (time.time() - btic),
                            trainer.learning_rate, name1, loss1, name2, loss2,
                            name3, loss3, name4, loss4))
            btic = time.time()

        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        name3, loss3 = coef_center_metric.get()
        name4, loss4 = coef_metric.get()
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
            .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                    name3, loss3, name4, loss4))
        if (epoch + 1) % args.val_interval == 0:
            # consider reduce the frequency of validation to save time
            map_bbox, map_polygon = validate(net, val_data, ctx, eval_metric,
                                             polygon_metric)
            map_name, mean_ap = map_bbox
            polygonmap_name, polygonmean_ap = map_polygon
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            polygonval_msg = '\n'.join([
                '{}={}'.format(k, v)
                for k, v in zip(polygonmap_name, polygonmean_ap)
            ])
            logger.info('[Epoch {}] PolygonValidation: \n{}'.format(
                epoch, polygonval_msg))
            current_map = float(polygonmean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
Example #6
class Trainer(object):
    def __init__(self, args):
        self.args = args
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size,
                       'crop_size': args.crop_size}
        trainset = get_segmentation_dataset(
            args.dataset, split=args.train_split, mode='train', **data_kwargs)
        valset = get_segmentation_dataset(
            args.dataset, split='val', mode='val', **data_kwargs)
        self.train_data = gluon.data.DataLoader(
            trainset, args.batch_size, shuffle=True, last_batch='rollover',
            num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
            last_batch='rollover', num_workers=args.workers)
        # create network
        if args.model_zoo is not None:
            model = get_model(args.model_zoo, pretrained=True)
        else:
            model = get_segmentation_model(model=args.model, dataset=args.dataset,
                                           backbone=args.backbone, norm_layer=args.norm_layer,
                                           norm_kwargs=args.norm_kwargs, aux=args.aux,
                                           crop_size=args.crop_size)
        model.cast(args.dtype)
        print(model)
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                model.load_parameters(args.resume, ctx=args.ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'" \
                    .format(args.resume))
        # create criterion
        criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data), 
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        optimizer_params = {'lr_scheduler': self.lr_scheduler,
                            'wd':args.weight_decay,
                            'momentum': args.momentum}
        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True

        if args.no_wd:
            for k, v in self.net.module.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       optimizer_params, kvstore = kv)
        # evaluation metrics
        self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.0
        alpha = 0.2
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data.astype(args.dtype, copy=False))
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
                autograd.backward(losses)
            self.optimizer.step(self.args.batch_size)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            tbar.set_description('Epoch %d, training loss %.3f'%\
                (epoch, train_loss/(i+1)))
            mx.nd.waitall()

        # save every epoch
        save_checkpoint(self.net.module, self.args, False)

    def validation(self, epoch):
        #total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
        self.metric.reset()
        tbar = tqdm(self.eval_data)
        for i, (data, target) in enumerate(tbar):
            outputs = self.evaluator(data.astype(args.dtype, copy=False))
            outputs = [x[0] for x in outputs]
            targets = mx.gluon.utils.split_and_load(target, args.ctx, even_split=False)
            self.metric.update(targets, outputs)
            pixAcc, mIoU = self.metric.get()
            tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\
                (epoch, pixAcc, mIoU))
            mx.nd.waitall()
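
The save_checkpoint called here takes (net, args, is_best) rather than the (flag, net, epoch, interval, is_best) signature used in the first example. A rough sketch consistent with this call site; the directory layout, attribute names (e.g. args.checkname), and file names are assumptions:

import os
import shutil

def save_checkpoint(net, args, is_best=False):
    """Hypothetical helper matching save_checkpoint(self.net.module, self.args, False)."""
    directory = os.path.join('runs', args.dataset, args.model, args.checkname)
    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = os.path.join(directory, 'checkpoint.params')
    net.save_parameters(filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(directory, 'model_best.params'))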
Example #7
        label = gluon.utils.split_and_load(batch[1],
                                           ctx_list=[context],
                                           batch_axis=0)
        weight = gluon.utils.split_and_load(batch[2],
                                            ctx_list=[context],
                                            batch_axis=0)

        with ag.record():
            outputs = [net(X) for X in data]
            loss = [
                L(yhat, y, w) for yhat, y, w in zip(outputs, label, weight)
            ]

        for l in loss:
            l.backward()
        lr_scheduler.update(i, epoch)
        trainer.step(batch_size)

        metric.update(label, outputs)

    break

#############################################################################
# Due to limitation on the resources, we only train the model for one batch in this tutorial.
#
# Please checkout the full :download:`training script
# <../../../scripts/pose/simple_pose/train_simple_pose.py>` to reproduce our results.
#
# References
# ----------
#
Example #8
class Trainer(object):
    def __init__(self, args):
        self.args = args

        self.two_model = False  ##
        self.semi = False

        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            # transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
            # transforms.Normalize([0, 0, 0], [1, 1, 1]), # ([0, 0, 0], [1, 1, 1])
            # transforms.Normalize([0], [1]), # this is for 1 channel: ([0], [1]) ([556.703], [482.175])
        ])

        # dataset and dataloader
        data_kwargs = {
            'transform': input_transform,
            'base_size': args.base_size,
            'crop_size': args.crop_size
        }

        trainset = get_segmentation_dataset(args.dataset,
                                            split=args.train_split,
                                            mode='train',
                                            **data_kwargs)

        valset = get_segmentation_dataset(args.dataset,
                                          split='val',
                                          mode='val',
                                          **data_kwargs)

        self.train_data = gluon.data.DataLoader(trainset,
                                                args.batch_size,
                                                shuffle=True,
                                                last_batch='rollover',
                                                num_workers=args.workers)

        self.eval_data = gluon.data.DataLoader(
            valset,
            args.batch_size,  # args.test_batch_size, [horse changed this]
            last_batch='rollover',
            num_workers=args.workers)

        # create network
        if args.model_zoo is not None:
            print('get model from the zoo.')
            model = get_model(args.model_zoo, pretrained=True)
            if self.two_model:
                self.model2 = get_model(
                    args.model_zoo, pretrained=True)  ## 2nd identical model
        else:
            print('create model.')
            model = get_segmentation_model(model=args.model,
                                           dataset=args.dataset,
                                           backbone=args.backbone,
                                           norm_layer=args.norm_layer,
                                           norm_kwargs=args.norm_kwargs,
                                           aux=args.aux,
                                           crop_size=args.crop_size,
                                           pretrained=False)
            if self.two_model:
                self.model2 = get_segmentation_model(
                    model=args.model,
                    dataset=args.dataset,
                    backbone=args.backbone,
                    norm_layer=args.norm_layer,
                    norm_kwargs=args.norm_kwargs,
                    aux=args.aux,
                    crop_size=args.crop_size,
                    pretrained=False)

        model.cast(args.dtype)
        if self.two_model:
            self.model2.cast(args.dtype)
        # print(model) # don't print model
        # print(help(model.collect_params))
        # >>> Notice here <<<
        # model.initialize() # horse ref: https://discuss.mxnet.io/t/object-detection-transfer-learning/2477/2
        ''' '''
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)

        if self.two_model:
            self.evaluator2 = DataParallelModel(SegEvalModel(self.model2),
                                                args.ctx)

        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                if not horse_changed:
                    model.load_parameters(args.resume, ctx=args.ctx)
                if horse_changed:
                    model.load_parameters(args.resume,
                                          ctx=args.ctx,
                                          allow_missing=True,
                                          ignore_extra=True)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'" \
                    .format(args.resume))
        ''' 
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        '''

        # create criterion
        criterion = MixSoftmaxCrossEntropyLoss(args.aux,
                                               aux_weight=args.aux_weight)
        self.criterion = DataParallelCriterion(criterion, args.ctx,
                                               args.syncbn)

        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly',
                                        baselr=args.lr,
                                        niters=len(self.train_data),
                                        nepochs=args.epochs)

        kv = mx.kv.create(args.kvstore)
        optimizer_params = {
            'lr_scheduler': self.lr_scheduler,
            'wd': args.weight_decay,
            'momentum': args.momentum
        }

        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True

        if args.no_wd:
            for k, v in self.net.module.collect_params(
                    '.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        self.optimizer = gluon.Trainer(self.net.module.collect_params(),
                                       'sgd',
                                       optimizer_params,
                                       kvstore=kv)
        # evaluation metrics
        self.metric = gluoncv.utils.metrics.SegmentationMetric(
            trainset.num_class)

    def training(self, epoch):
        if self.two_model:
            self.model2.load_parameters(
                'runs/pascal_voc/deeplab/HVSMR/res50_backup.params',
                ctx=args.ctx)  # args.resume
            self.model2.cast(args.dtype)
            self.evaluator2 = DataParallelModel(SegEvalModel(self.model2),
                                                args.ctx)

        if horse_changed:
            print('>>> start training.')  # [horse]
            tbar = tqdm(self.train_data)
            train_loss = 0.0
            alpha = 0.2
            for i, (data, target) in enumerate(tbar):
                self.lr_scheduler.update(i, epoch)
                with autograd.record(True):
                    # >>>>>>>>>>>>>>>>>>>>
                    global print_shape
                    if print_shape:
                        print('>>> data of one batch:')
                        print(data.shape, target.shape)  # horse
                        '''
                        with open('have_a_look.pkl', 'wb') as fo:
                            pickle.dump(data.asnumpy(), fo)
                            pickle.dump(target.asnumpy(), fo)
                        '''
                        for ii in range(data.shape[1]):
                            one_sample = data[0, ii, :, :].asnumpy()
                            s_mean = np.mean(one_sample.flatten())
                            s_std = np.std(one_sample.flatten())
                            s_min = min(one_sample.flatten())
                            s_max = max(one_sample.flatten())
                            print('dim | mean | std | min | max', ii, s_mean,
                                  s_std, s_min, s_max)
                        print_shape = False
                    # >>>>>>>>>>>>>>>>>>>>
                    outputs = self.net(data.astype(args.dtype, copy=False))
                    # print('outputs:', len(outputs[0]), outputs[0][0].shape) # [horse]
                    # print('target:', target.shape)
                    # outputs: 2 (14, 3, 250, 250)
                    # target: (14, 250, 250)

                    # +++++ +++++ +++++
                    _outputs = outputs
                    _target = mx.ndarray.reshape(
                        target,
                        shape=(-3, -2))  # to be (batch_size*NUM_SEQ, 250, 250)
                    # +++++ +++++ +++++

                    # losses = self.criterion(outputs, target)
                    losses = self.criterion(_outputs, _target)
                    mx.nd.waitall()
                    autograd.backward(losses)
                self.optimizer.step(self.args.batch_size)
                for loss in losses:
                    train_loss += loss.asnumpy()[0] / len(losses)
                tbar.set_description('Epoch %d, training loss %.3f'%\
                    (epoch, train_loss/(i+1)))
                mx.nd.waitall()

            # save every epoch
            save_checkpoint(self.net.module, self.args, False)
            # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++

        if not horse_changed:
            tbar = tqdm(self.train_data)
            train_loss = 0.0
            alpha = 0.2
            for i, (data, target) in enumerate(tbar):
                self.lr_scheduler.update(i, epoch)
                with autograd.record(True):
                    outputs = self.net(data.astype(args.dtype, copy=False))
                    # print('target:', target.shape) # target: (4, 480, 480)
                    ## print('target sum before:', [i.sum() for i in target.asnumpy()]) # target sum: [389344.0, 0.0, 0.0, 188606.0]

                    # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++
                    # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++
                    if self.semi:
                        pos = np.where(
                            np.array([i.sum()
                                      for i in target.asnumpy()]) == 0)[0]
                        ## print('pos',pos)
                        if len(pos) != 0:
                            data2 = data[pos, :, :, :]
                            _outputs = self.evaluator2(
                                data2.astype(args.dtype, copy=False))
                            _outputs = [x[0] for x in _outputs]
                            label_generated = np.zeros(
                                (len(pos), target.shape[1], target.shape[2]))
                            for k in range(len(pos)):
                                ## print(_outputs[0].shape)
                                label_slice = labeler_random(
                                    _outputs[0].asnumpy()[k, 0:3, :, :],
                                    crop_size=target.shape[1],
                                    prob_cut=0.46)
                                label_generated[k, :, :] = label_slice
                            target[pos, :, :] = mx.nd.array(label_generated)
                            ## print('target sum after:', [i.sum() for i in target.asnumpy()])
                    '''         
                    if True:
                        # print('targets and outputs shape:', len(outputs), outputs[0].shape) # outputs: 1 (18, 3, 250, 250); targets: 1 (18, 250, 250)                        
                        for sample in range(2):
                            mx2img(data[sample,:,:,:], str(sample)+'.jpg')
                            mx2img(target[sample,:,:], str(sample)+'.png')       
                    '''
                    # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++
                    # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++

                    losses = self.criterion(outputs, target)
                    mx.nd.waitall()
                    autograd.backward(losses)
                self.optimizer.step(self.args.batch_size)
                for loss in losses:
                    train_loss += loss.asnumpy()[0] / len(losses)
                tbar.set_description('Epoch %d, training loss %.3f'%\
                    (epoch, train_loss/(i+1)))
                mx.nd.waitall()

            # save every epoch
            save_checkpoint(self.net.module, self.args, False)
            # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++
        ''' <- this is backup
        if not horse_changed:
            tbar = tqdm(self.train_data)
            train_loss = 0.0
            alpha = 0.2
            for i, (data, target) in enumerate(tbar):
                self.lr_scheduler.update(i, epoch)
                with autograd.record(True):
                    outputs = self.net(data.astype(args.dtype, copy=False))
                    losses = self.criterion(outputs, target)
                    mx.nd.waitall()
                    autograd.backward(losses)
                self.optimizer.step(self.args.batch_size)
                for loss in losses:
                    train_loss += loss.asnumpy()[0] / len(losses)
                tbar.set_description('Epoch %d, training loss %.3f'%\
                    (epoch, train_loss/(i+1)))
                mx.nd.waitall()

            # save every epoch
            save_checkpoint(self.net.module, self.args, False)
            # ++++++++++ ++++++++++ ++++++++++ ++++++++++ ++++++++++
        '''

    def validation(self, epoch):
        if not horse_changed:
            output_to_see = False  # False # [horse added]
            output_score_map = False  # [horse added]
            #total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
            self.metric.reset()
            tbar = tqdm(self.eval_data)

            output_index = 0  # [horse added]
            for i, (data, target) in enumerate(tbar):
                # print('target', target)
                outputs = self.evaluator(data.astype(args.dtype, copy=False))
                outputs = [x[0] for x in outputs]
                # print(outputs)
                '''
                if i == 50:
                    with open('have_a_look.pkl', 'wb') as fo:
                        pickle.dump(outputs[0].asnumpy(),fo)
                '''
                targets = mx.gluon.utils.split_and_load(target,
                                                        args.ctx,
                                                        even_split=False)

                # ++++++++++ ++++++++++ ++++++++++
                if output_to_see:
                    # print('targets and outputs shape:', len(outputs), outputs[0].shape) # outputs: 1 (18, 3, 250, 250); targets: 1 (18, 250, 250)
                    output_prefix = 'outdir_tosee'
                    if not os.path.exists(output_prefix):
                        os.makedirs(output_prefix)
                    batch_size = self.args.batch_size
                    crop_size = self.args.crop_size

                    for sample in range(batch_size):
                        path = os.path.join(output_prefix,
                                            str(output_index) + '.png')
                        mx2img(outputs[0][sample, :, :, :], path)
                        output_index += 1
                # ++++++++++ ++++++++++ ++++++++++
                if output_score_map:
                    score_map_dir = 'scoredir_tosee'  # args.scoredir
                    if not os.path.exists(score_map_dir):
                        os.makedirs(score_map_dir)

                    batch_size = self.args.batch_size
                    for sample in range(batch_size):
                        # score_map_name = os.path.splitext(impath)[0] + '.pkl'
                        # score_map_path = os.path.join(score_map_dir, score_map_name)
                        score_map_path = os.path.join(
                            score_map_dir,
                            str(output_index) + '.pkl')
                        with open(score_map_path, 'wb') as fo:
                            pickle.dump(
                                outputs[0].asnumpy()[sample, 0:3, :, :], fo)
                        output_index += 1

                self.metric.update(targets, outputs)
                '''
                pixAcc, mIoU = self.metric.get()
                tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\
                    (epoch, pixAcc, mIoU))
                '''
                pixAcc, mIoU, dice = self.metric.get()  # [horse changed]
                tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f, dice: %.3f, %.3f, %.3f'%\
                    (epoch, pixAcc, mIoU, dice[0], dice[1], dice[2]))

                mx.nd.waitall()

        if horse_changed:
            output_to_see = True  # False
            #total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
            self.metric.reset()
            tbar = tqdm(self.eval_data)

            output_index = 0
            for i, (data, target) in enumerate(tbar):
                # print('target', target)
                outputs = self.evaluator(data.astype(args.dtype, copy=False))
                outputs = [x[0] for x in outputs]

                _target = mx.ndarray.reshape(target, shape=(-3, -2))
                targets = mx.gluon.utils.split_and_load(_target,
                                                        args.ctx,
                                                        even_split=False)

                # ++++++++++ ++++++++++ ++++++++++
                if output_to_see:
                    # print('targets and outputs shape:', len(outputs), outputs[0].shape) # outputs: 1 (18, 3, 250, 250); targets: 1 (18, 250, 250)
                    output_prefix = 'outdir_seq'
                    batch_size = self.args.batch_size
                    crop_size = self.args.crop_size
                    NUM_SEQ = int(outputs[0].shape[0] / batch_size)
                    # print(batch_size, NUM_SEQ, crop_size)

                    outputs_out = mx.ndarray.reshape(
                        outputs[0],
                        shape=(batch_size, NUM_SEQ, 3, crop_size, crop_size)
                    )  # 3 is the class number not image channel, just for convenience
                    targets_out = mx.ndarray.reshape(targets[0],
                                                     shape=(batch_size,
                                                            NUM_SEQ, crop_size,
                                                            crop_size))

                    for sample in range(batch_size):
                        for seq in range(NUM_SEQ):
                            path = os.path.join(
                                output_prefix,
                                str(output_index) + '_' + str(seq) + '.png')
                            path_mask = os.path.join(
                                output_prefix,
                                str(output_index) + '_gt_' + str(seq) + '.png')

                            mx2img(outputs_out[sample, seq, :, :, :], path)
                            mx2img(targets_out[sample, seq, :, :], path_mask)
                        output_index += 1
                # ++++++++++ ++++++++++ ++++++++++

                self.metric.update(targets, outputs)
                '''
                pixAcc, mIoU = self.metric.get()
                tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\
                    (epoch, pixAcc, mIoU))
                '''
                pixAcc, mIoU, dice = self.metric.get()  # [horse changed]
                tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f, dice: %.3f, %.3f, %.3f'%\
                    (epoch, pixAcc, mIoU, dice[0], dice[1], dice[2]))

                mx.nd.waitall()
Example #9
def train():
    epochs = 101

    lr = 0.1

    momentum = 0.9
    wd = 5e-4

    plot_period = 20

    ctx = [mx.gpu(i) for i in range(2)]
    batch_size = 256

    train_set = MNIST(train=True, transform=transform_train)
    train_data = gluon.data.DataLoader(train_set, batch_size, True, num_workers=4, last_batch='discard')
    val_set = MNIST(train=False, transform=transform_val)
    val_data = gluon.data.DataLoader(val_set, batch_size, shuffle=False, num_workers=4)

    net = MnistNet(embedding_size=2)
    net.initialize(init=mx.init.MSRAPrelu(), ctx=ctx)
    net.hybridize()

    loss = CenterLoss(10, 2, 1)
    loss.initialize(ctx=ctx)

    num_batches = len(train_set) // batch_size
    train_params = net.collect_params()
    train_params.update(loss.params)

    lr_scheduler = LRScheduler("cosine", lr,  niters=num_batches, nepochs=epochs, targetlr=1e-8,
                               warmup_epochs=10, warmup_lr=0.001)
    trainer = gluon.Trainer(train_params, 'nag', {'lr_scheduler': lr_scheduler, 'momentum': momentum, 'wd': wd})

    metric = mtc.Accuracy()
    num_batch = len(train_data)

    for epoch in range(epochs):

        plot = (epoch % plot_period) == 0

        train_loss = 0
        metric.reset()
        tic = time.time()
        ebs, lbs = [], []

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            labels = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)

            with ag.record():
                ots = [net(X) for X in data]
                embedds = [ot[0] for ot in ots]
                outputs = [ot[1] for ot in ots]
                losses = [loss(yhat, y, emb) for yhat, y, emb in zip(outputs, labels, embedds)]

            for l in losses:
                ag.backward(l)

            if plot:
                for es, ls in zip(embedds, labels):
                    assert len(es) == len(ls)
                    for idx in range(len(es)):
                        ebs.append(es[idx].asnumpy())
                        lbs.append(ls[idx].asscalar())

            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            metric.update(labels, outputs)

            train_loss += sum([l.mean().asscalar() for l in losses]) / len(losses)

        _, train_acc = metric.get()
        train_loss /= num_batch

        val_acc, val_loss, val_ebs, val_lbs = validate(net, val_data, ctx, loss, plot)

        toc = time.time()
        print('[epoch % 3d] train accuracy: %.6f, train loss: %.6f | '
              'val accuracy: %.6f, val loss: %.6f, time: %.6f'
              % (epoch, train_acc, train_loss, val_acc, val_loss, toc - tic))

        if plot:
            ebs, lbs = np.vstack(ebs), np.hstack(lbs)

            plot_result(ebs, lbs, os.path.join("../../resources", "center-train-epoch{}.png".format(epoch)))
            plot_result(val_ebs, val_lbs, os.path.join("../../resources", "center-val-epoch{}.png".format(epoch)))
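
The schedule in this example combines cosine decay with a linear warmup (warmup_epochs=10, warmup_lr=0.001). One way to sanity-check the resulting curve is to step a fresh scheduler through every iteration and record its value. This sketch assumes the scheduler exposes its current value as a learning_rate attribute after update(i, epoch), which is how the unit test at the top of this page appears to read it:

def dump_schedule(scheduler, niters, nepochs):
    """Sketch: collect the per-iteration learning rate for inspection or plotting."""
    values = []
    for epoch in range(nepochs):
        for i in range(niters):
            scheduler.update(i, epoch)
            values.append(scheduler.learning_rate)
    return values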
Example #10
class Trainer(object):
    def __init__(self, args):
        self.args = args
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        trainset = get_segmentation_dataset(
            args.dataset, split='train', transform=input_transform)
        valset = get_segmentation_dataset(
            args.dataset, split='val', transform=input_transform)
        self.train_data = gluon.data.DataLoader(
            trainset, args.batch_size, shuffle=True, last_batch='rollover',
            num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
            last_batch='keep', num_workers=args.workers)
        # create network
        model = get_segmentation_model(model=args.model, dataset=args.dataset,
                                       backbone=args.backbone, norm_layer=args.norm_layer,
                                       aux=args.aux, norm_kwargs=args.norm_kwargs)
        # model.hybridize(static_alloc=True, static_shape=True)
        print(model)
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                model.load_params(args.resume, ctx=args.ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'" \
                    .format(args.resume))
        # create criterion
        criterion = SoftmaxCrossEntropyLossWithAux(args.aux)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data), 
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       {'lr_scheduler': self.lr_scheduler,
                                        'wd':args.weight_decay,
                                        'momentum': args.momentum,
                                        'multi_precision': True},
                                        kvstore = kv)

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.0
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data)
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
                autograd.backward(losses)
            self.optimizer.step(self.args.batch_size)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            tbar.set_description('Epoch %d, training loss %.3f'%\
                (epoch, train_loss/(i+1)))
            mx.nd.waitall()

        # save every epoch
        save_checkpoint(self.net.module, self.args, False)

    def validation(self, epoch):
        total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
        tbar = tqdm(self.eval_data)
        for i, (data, target) in enumerate(tbar):
            outputs = self.evaluator(data, target)
            for (correct, labeled, inter, union) in outputs:
                total_correct += correct
                total_label += labeled
                total_inter += inter
                total_union += union
            pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
            IoU = 1.0 * total_inter / (np.spacing(1) + total_union)
            mIoU = IoU.mean()
            tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f'%\
                (epoch, pixAcc, mIoU))
            mx.nd.waitall()
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay, power=2,
                               warmup_epochs=args.warmup_epochs)
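    # Assumed behaviour of warmup_epochs: the scheduler ramps the learning rate
    # up to baselr during the first warmup_epochs, then follows the selected
    # lr_mode (step decay uses lr_decay_epoch and step_factor; 'poly' uses the
    # power=2 exponent given above).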

    trainer = gluon.Trainer(
        net.collect_params(), 'sgd',
        {'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler},
        kvstore='local')

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
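        # Mixup (when enabled) asks the dataset wrapper to draw a blending
        # ratio from Beta(1.5, 1.5) for each sample pair; it is switched off
        # for the final no_mixup_epochs so the detector finishes training on
        # unmixed images and labels.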
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6)]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info('[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                    epoch, i, trainer.learning_rate, batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
            epoch, (time.time()-tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
        if not (epoch + 1) % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
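
# The example calls save_params(...) but never defines it. A minimal sketch of
# such a helper (an assumption, not part of the original example): best_map is
# a one-element list so the best validation mAP can be updated in place.
def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    current_map = float(current_map)
    if current_map > best_map[0]:
        # new best validation mAP: remember it and snapshot the weights
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
    if save_interval and (epoch + 1) % save_interval == 0:
        # periodic checkpoint regardless of the score
        net.save_parameters('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))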
Example #12
def train(net, train_data, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay, power=2,
                               warmup_epochs=args.warmup_epochs)

    if args.optimizer.lower() == 'adam':
        opt_name = 'adam'
        opt_param = {'wd': args.wd, 'lr_scheduler': lr_scheduler}
    elif args.optimizer.lower() == 'sgd':
        opt_name = 'sgd'
        opt_param = {'wd': args.wd, 'momentum': args.momentum, 'lr_scheduler': lr_scheduler}
    else:
        raise NotImplementedError(f'The optimizer {args.optimizer.lower()} is not implemented.')

    trainer = gluon.Trainer(net.collect_params(), opt_name, opt_param, kvstore='local')

    # metrics
    obj_metrics = mx.metric.Loss('O')
    center_metrics = mx.metric.Loss('BC')
    scale_metrics = mx.metric.Loss('BS')
    cls_metrics = mx.metric.Loss('C')
    coef_metrics = mx.metric.Loss('Cf')  # note: never updated in the loop below
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            fixed_targets = [gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0) for it in range(1, 6)]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(x, gt_boxes[ix],
                                                                      *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info('[E {}][B {}], LR: {:.2E}, {:.1f} S/s, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
                    epoch, i, trainer.learning_rate, batch_size / (time.time() - btic), name1, loss1, name2, loss2,
                    name3, loss3, name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info('[E {}] {:.1f} sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
            epoch, (time.time() - tic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
        save_params(net, epoch, args.save_interval, args.save_prefix)
Example #13
def train_net(train_epoch, ctx, batch_size, data_dir, pre_trained_model,
              output_stride, freeze_batch_norm, initial_learning_rate,
              weight_decay, base_architecture, aspp_or_vortex, resume):

    if base_architecture == 'resnet_v2_50':
        print('use resnet_v2_50')
        net = ResNet(BottleneckV2, [3, 4, 6, 3], [64, 256, 512, 1024, 2048],
                     output_stride, aspp_or_vortex)
    elif base_architecture == 'resnet_v2_101':
        print('use resnet_v2_101')
        net = ResNet(BottleneckV2, [3, 4, 23, 3], [64, 256, 512, 1024, 2048],
                     output_stride, aspp_or_vortex)

    if resume >= 0:
        print('resume training from checkpoint')
        begin_epoch = resume + 1
        model_path = './checkpoint/deeplabv3_%s.params' % resume

        net.initialize(ctx=ctx)
        print('model_path', model_path)
        net.collect_params().load(model_path, ctx=ctx, restore_prefix='')

        if output_stride == 8:
            begin_epoch = resume + 1 - 46
        if freeze_batch_norm == 1:
            print('In the last 30K iters, freeze batch norm')
            net.collect_params(
                '.*gamma|.*beta|.*running_mean|.*running_var').setattr(
                    'grad_req', 'null')
    else:
        print('begin training')
        begin_epoch = 0
        print('before auto init')
        net.initialize(ctx=ctx)
        print('after auto init')
        net.load_params(pre_trained_model,
                        ctx=ctx,
                        allow_missing=True,
                        ignore_extra=True)

    loss = SoftmaxCrossEntropyLoss()
    # First 30K iters: use split='train_aug'; last 30K iters: use 'trainval'.
    train_data = VOCSegDataset(root=data_dir, split='trainval')
    val_data = VOCSegDataset(root=data_dir, split='val')
    train_dataiter = gluon.data.DataLoader(train_data,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           last_batch='discard')
    val_dataiter = gluon.data.DataLoader(val_data,
                                         batch_size=batch_size,
                                         last_batch='discard')
    lr_scheduler = LRScheduler(mode='poly',
                               baselr=initial_learning_rate,
                               niters=len(train_dataiter),
                               nepochs=train_epoch)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'lr_scheduler': lr_scheduler,
            'wd': weight_decay,
            'momentum': 0.9,
            'multi_precision': True
        })
    for epoch in range(begin_epoch, train_epoch):
        train_loss, train_acc, meaniou, n, m = 0, 0, 0, 0, 0
        total_inter, total_union, total_correct, total_label = (0, ) * 4
        iter = 0
        for i, batch in enumerate(train_dataiter):
            data, label, batch_size = _get_batch(batch, ctx)
            lr_scheduler.update(i, epoch)

            with autograd.record():
                output = [net(x) for x in data]
                losses = [loss(yhat, y) for yhat, y in zip(output, label)]

            for l in losses:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in losses])

            n += batch_size

            m += sum([y.size for y in label])
            #  evaluation
            correct, labeled = (0, ) * 2
            result_pix = [
                batch_pix_accuracy(output_, label_)
                for output_, label_ in zip(output, label)
            ]
            for j in range(len(result_pix)):
                correct += result_pix[j][0]
                labeled += result_pix[j][1]
            inter, union = (0, ) * 2
            result_iou = [
                batch_intersection_union(output_, label_, 21)
                for output_, label_ in zip(output, label)
            ]
            for j in range(len(result_iou)):
                inter += result_iou[j][0]
                union += result_iou[j][1]
            total_correct += correct.astype('int64')
            total_label += labeled.astype('int64')
            total_inter += inter.astype('int64')
            total_union += union.astype('int64')
            pix_acc = np.float64(1.0) * total_correct / (
                np.spacing(1, dtype=np.float64) + total_label)
            IoU = np.float64(1.0) * total_inter / (
                np.spacing(1, dtype=np.float64) + total_union)
            mIoU = IoU.mean()
            iter = iter + 1
            if iter % 10 == 0:
                print(
                    '-Epoch %s, Batch %d. Loss: %f, pix_acc: %.4f, mIoU: %.4f'
                    % (epoch, iter, train_loss / n, pix_acc, mIoU))
        net.collect_params().save(filename='./checkpoint/deeplabv3_%s.params' %
                                  (epoch))

        val_pix_acc, val_mIoU = evaluate_accuracy(val_dataiter, net, ctx)
        print('val_pix_acc: %.4f, val_mIoU: %.4f' % (val_pix_acc, val_mIoU))
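
# evaluate_accuracy(...) is used above but not defined in this example. A
# minimal sketch (an assumption) that mirrors the training-loop metrics by
# reusing _get_batch, batch_pix_accuracy and batch_intersection_union:
def evaluate_accuracy(data_iter, net, ctx):
    total_correct, total_label, total_inter, total_union = 0, 0, 0, 0
    for batch in data_iter:
        data, label, _ = _get_batch(batch, ctx)
        outputs = [net(x) for x in data]
        for output_, label_ in zip(outputs, label):
            correct, labeled = batch_pix_accuracy(output_, label_)
            inter, union = batch_intersection_union(output_, label_, 21)
            total_correct += correct
            total_label += labeled
            total_inter += inter
            total_union += union
    # epsilon avoids division by zero for classes that never appear
    pix_acc = 1.0 * total_correct / (np.spacing(1) + total_label)
    mIoU = (1.0 * total_inter / (np.spacing(1) + total_union)).mean()
    return pix_acc, mIoU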